From 80d98084fa584510838598497208d4f149da3fe6 Mon Sep 17 00:00:00 2001
From: Cam Quilici
Date: Tue, 28 Apr 2026 09:50:25 -0500
Subject: [PATCH 01/16] srt-slurm: upstream recipes and add first-class
 `recipe:` field

Recipes referenced from NVIDIA/srt-slurm@sa-submission-q2-2026 are now
tracked under benchmarks/multi_node/srt-slurm-recipes/, mirroring the
upstream `recipes/` layout. The master-yaml plumbing for selecting one
is hoisted out of `prefill.additional-settings:
["CONFIG_FILE=recipes/..."]` into a first-class `recipe:` field on the
search-space entry, validated against on-disk paths so unknown recipes
fail fast at sweep generation. The benchmark template resolves it to an
absolute scratch-copy path passed to launchers as CONFIG_FILE, so
launcher behavior is otherwise unchanged.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .github/configs/CONFIGS.md | 26 + .github/configs/nvidia-master.yaml | 1512 +++++------------ .../workflows/benchmark-multinode-tmpl.yml | 23 + .github/workflows/e2e-tests.yml | 2 + .github/workflows/run-sweep.yml | 2 + .../srt-slurm-recipes/b200-fp4/1k1k.yaml | 259 +++ .../srt-slurm-recipes/b200-fp4/8k1k.yaml | 351 ++++ .../srt-slurm-recipes/b200-fp8/1k1k.yaml | 281 +++ .../b200-fp8/8k1k_mtp_lowlat_0.yaml | 141 ++ .../b200-fp8/8k1k_mtp_lowlat_1.yaml | 141 ++ .../b200-fp8/8k1k_mtp_lowlat_2.yaml | 141 ++ .../b200-fp8/8k1k_mtp_maxtpt_0.yaml | 144 ++ .../b200-fp8/8k1k_mtp_maxtpt_1.yaml | 144 ++ .../b200-fp8/8k1k_mtp_maxtpt_2.yaml | 144 ++ .../b200-fp8/8k1k_mtp_maxtpt_3.yaml | 144 ++ .../b200-fp8/8k1k_stp_lowlat_0.yaml | 139 ++ .../b200-fp8/8k1k_stp_lowlat_1.yaml | 139 ++ .../b200-fp8/8k1k_stp_lowlat_2.yaml | 139 ++ .../b200-fp8/8k1k_stp_maxtpt_0.yaml | 140 ++ .../b200-fp8/8k1k_stp_maxtpt_1.yaml | 140 ++ .../b200-fp8/8k1k_stp_maxtpt_2.yaml | 140 ++ .../b200-fp8/8k1k_stp_maxtpt_3.yaml | 140 ++ .../gb200-fp4/1k1k/low-latency.yaml | 116 ++ .../gb200-fp4/1k1k/max-tpt.yaml | 183 ++ .../gb200-fp4/1k1k/mid-curve.yaml | 182 ++ .../gb200-fp4/8k1k/low-latency.yaml | 118 ++ .../gb200-fp4/8k1k/max-tpt.yaml | 179 ++ .../gb200-fp4/8k1k/mid-curve.yaml | 179 ++ .../gb200-fp8/1k1k/low-latency.yaml | 121 ++ .../gb200-fp8/1k1k/max-tpt.yaml | 175 ++ .../gb200-fp8/1k1k/mid-curve.yaml | 174 ++ .../gb200-fp8/1k1k/ultra-tpt.yaml | 176 ++ .../gb200-fp8/8k1k/low-latency.yaml | 117 ++ .../gb200-fp8/8k1k/max_tpt.yaml | 171 ++ .../gb200-fp8/8k1k/mid-curve.yaml | 170 ++ .../gb300-fp4/1k1k/low_latency.yaml | 116 ++ .../gb300-fp4/1k1k/max_tpt.yaml | 184 ++ .../gb300-fp4/1k1k/mid_curve.yaml | 182 ++ .../gb300-fp4/8k1k/low_latency.yaml | 119 ++ .../gb300-fp4/8k1k/max_tpt.yaml | 179 ++ .../gb300-fp4/8k1k/mid_curve.yaml | 179 ++ .../gb300-fp8/1k1k/stp/low-latency.yaml | 122 ++ .../gb300-fp8/1k1k/stp/max.yaml | 171 ++ .../gb300-fp8/1k1k/stp/mid.yaml | 170 ++ .../gb300-fp8/8k1k/stp/low-latency.yaml | 121 ++ .../gb300-fp8/8k1k/stp/max.yaml | 171 ++ .../gb300-fp8/8k1k/stp/mid.yaml | 171 ++ .../1k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml | 114 ++ .../1k1k/mtp/h100-fp8-1p2d-max-tp-mtp.yaml | 116 ++ .../h100/1k1k/stp/h100-fp8-1p1d-max-dep.yaml | 102 ++ .../h100/1k1k/stp/h100-fp8-1p2d-max-tp.yaml | 102 ++ .../8k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml | 116 ++ .../8k1k/mtp/h100-fp8-1p1d-max-tp-mtp.yaml | 116 ++ .../h100/8k1k/stp/h100-fp8-1p1d-max-dep.yaml | 102 ++ .../h100/8k1k/stp/h100-fp8-1p1d-max-tp.yaml | 102 ++ .../h200/1k1k/bs256-1p6d-dep-mtp.yaml | 121 ++ .../h200/1k1k/bs256-1p6d-dep.yaml | 109 ++ .../h200/1k1k/bs256-1p6d-tp-mtp.yaml | 118 ++ .../h200/1k1k/bs256-1p6d-tp.yaml | 109 ++ 
.../h200/1k1k/low-latency-1p9d-mtp.yaml | 116 ++ .../h200/1k1k/low-latency-1p9d.yaml | 106 ++ .../h200/8k1k/bs128-1p1d-dep-mtp.yaml | 118 ++ .../h200/8k1k/bs128-1p1d-dep.yaml | 109 ++ .../h200/8k1k/bs16-1p3d-mtp.yaml | 116 ++ .../h200/8k1k/bs16-1p3d.yaml | 107 ++ .../h200/8k1k/bs4-1p7d-mtp.yaml | 116 ++ .../srt-slurm-recipes/h200/8k1k/bs4-1p7d.yaml | 107 ++ .../h200/8k1k/bs64-2p3d-mtp.yaml | 125 ++ .../h200/8k1k/bs64-2p3d.yaml | 115 ++ .../h200/8k1k/bs8-1p6d-mtp.yaml | 117 ++ .../srt-slurm-recipes/h200/8k1k/bs8-1p6d.yaml | 108 ++ ...ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml | 125 ++ ...ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml | 129 ++ ..._gen1dep8_batch768_allconc_eplb0_mtp0.yaml | 217 +++ ..._gen4tep8_batch128_allconc_eplb0_mtp0.yaml | 138 ++ ...p4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml | 122 ++ ...tx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml | 153 ++ ...tx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml | 137 ++ ...4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml | 126 ++ ...p4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml | 123 ++ ...4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml | 126 ++ ...ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml | 124 ++ ...ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml | 126 ++ ..._gen1dep8_batch256_allconc_eplb0_mtp0.yaml | 155 ++ ...tx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml | 138 ++ .../ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml | 119 ++ .../ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml | 117 ++ .../mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml | 112 ++ .../ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml | 126 ++ .../ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml | 120 ++ .../ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml | 126 ++ .../ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml | 117 ++ .../ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml | 114 ++ .../ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml | 112 ++ .../stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml | 106 ++ .../ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml | 127 ++ .../ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml | 116 ++ .../mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml | 116 ++ .../ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml | 123 ++ .../mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml | 112 ++ .../mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml | 119 ++ .../ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml | 120 ++ .../ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml | 124 ++ .../ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml | 119 ++ .../stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml | 107 ++ .../stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml | 120 ++ .../ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml | 115 ++ .../ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml | 118 ++ .../ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml | 111 ++ .../ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml | 114 ++ ...x1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml | 121 ++ ...x1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml | 121 ++ ...x1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml | 121 ++ ...tx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml | 121 ++ .../ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml | 121 ++ .../ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml | 121 ++ .../ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml | 121 ++ .../ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml | 121 ++ ...x1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml | 115 ++ ...tx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml | 115 ++ ...ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml | 115 ++ .../ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml | 115 ++ ...tx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml | 115 ++ ...x2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml | 115 ++ .../ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml | 123 ++ .../ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml | 123 ++ .../ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml | 123 
++ .../ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml | 123 ++ ...ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml | 125 ++ .../ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml | 125 ++ ...x4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml | 125 ++ ...tx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml | 115 ++ ...tx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml | 115 ++ .../ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml | 117 ++ ...ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml | 115 ++ .../ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml | 116 ++ .../ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml | 116 ++ .../ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml | 116 ++ ...tx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml | 115 ++ .../ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml | 127 ++ .../ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml | 125 ++ .../mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml | 123 ++ .../ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml | 139 ++ .../ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml | 129 ++ .../ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml | 130 ++ .../ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml | 131 ++ .../ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml | 121 ++ .../stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml | 117 ++ .../stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml | 121 ++ .../ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml | 136 ++ .../ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml | 124 ++ .../ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml | 129 ++ .../ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml | 123 ++ .../ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml | 129 ++ .../mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml | 127 ++ .../mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml | 123 ++ .../mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml | 126 ++ .../ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml | 125 ++ .../ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml | 128 ++ .../ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml | 123 ++ .../ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml | 121 ++ .../stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml | 130 ++ .../stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml | 118 ++ .../ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml | 120 ++ .../ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml | 128 ++ .../ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml | 128 ++ ...tx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml | 133 ++ ...x1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml | 133 ++ ...ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml | 133 ++ .../ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml | 134 ++ .../ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml | 134 ++ ...x3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml | 133 ++ ...x1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml | 127 ++ ...tx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml | 127 ++ ...ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml | 127 ++ .../ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml | 128 ++ .../ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml | 128 ++ .../ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml | 128 ++ ...2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml | 127 ++ .../ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml | 133 ++ .../ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml | 134 ++ .../ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml | 134 ++ .../ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml | 134 ++ .../ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml | 133 ++ .../ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml | 133 ++ .../ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml | 128 ++ .../ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml | 128 ++ .../ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml | 127 ++ .../ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml | 127 ++ .../ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml | 128 ++ ...tx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml | 127 ++ ...x7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml | 127 ++ .../ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml 
| 117 ++ .../mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml | 121 ++ ...ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml | 152 ++ .../ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml | 128 ++ .../ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml | 213 +++ .../ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml | 113 ++ .../ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml | 175 ++ .../ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml | 207 +++ .../stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml | 110 ++ .../ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml | 125 ++ ...ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml | 146 ++ .../ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml | 119 ++ ...tx11_gen1_dep16_batch256_eplb256_mtp1.yaml | 152 ++ .../mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml | 123 ++ .../ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml | 117 ++ .../ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml | 128 ++ .../ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml | 119 ++ ...tx10_gen1_dep16_batch256_eplb256_mtp0.yaml | 146 ++ .../ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml | 121 ++ .../stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml | 112 ++ .../ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml | 112 ++ .../ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml | 115 ++ .../ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml | 127 ++ ...x1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml | 127 ++ ...tx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml | 121 ++ ...x1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml | 151 ++ ...x1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml | 183 ++ .../ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml | 120 ++ .../ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml | 120 ++ .../ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml | 121 ++ ...1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml | 129 ++ ...x1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml | 117 ++ ...ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml | 114 ++ ...x1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml | 177 ++ ...x1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml | 209 +++ .../ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml | 114 ++ .../ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml | 115 ++ .../ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml | 120 ++ .../ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml | 120 ++ .../ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml | 119 ++ ...tx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml | 121 ++ ...ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml | 127 ++ ...ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml | 120 ++ ...tx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml | 123 ++ .../ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml | 116 ++ .../ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml | 114 ++ .../ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml | 114 ++ ...ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml | 114 ++ ...tx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml | 117 ++ ...tx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml | 115 ++ ...x5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml | 121 ++ .../mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml | 121 ++ .../ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml | 216 +++ .../mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml | 119 ++ .../mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml | 124 ++ ...ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml | 139 ++ .../ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml | 127 ++ .../stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml | 113 ++ .../ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml | 127 ++ .../ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml | 117 ++ .../ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml | 241 +++ ...ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml | 149 ++ .../ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml | 122 ++ .../ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml | 123 ++ .../ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml | 151 ++ ...ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml | 131 ++ 
.../mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml | 122 ++ .../mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml | 119 ++ .../mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml | 122 ++ .../ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml | 119 ++ .../ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml | 120 ++ .../ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml | 146 ++ .../ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml | 129 ++ .../ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml | 117 ++ .../stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml | 113 ++ .../stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml | 114 ++ .../stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml | 114 ++ .../ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml | 116 ++ .../ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml | 122 ++ ...tx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml | 126 ++ ...ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml | 122 ++ .../ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml | 123 ++ .../ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml | 123 ++ ...2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml | 138 ++ ...tx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml | 124 ++ ...x3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml | 186 ++ .../ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml | 119 ++ .../ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml | 117 ++ .../ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml | 117 ++ ...2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml | 132 ++ ...x2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml | 120 ++ ...x3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml | 180 ++ ...3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml | 212 +++ ...10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml | 130 ++ .../ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml | 123 ++ .../ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml | 123 ++ ...ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml | 123 ++ ...x7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml | 138 ++ ...tx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml | 126 ++ .../ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml | 117 ++ .../ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml | 117 ++ .../ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml | 118 ++ ...tx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml | 120 ++ ...tx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml | 118 ++ ...x7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml | 124 ++ ...x7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml | 148 ++ .../ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml | 105 ++ .../ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml | 109 ++ .../ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml | 101 ++ .../ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml | 114 ++ .../ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml | 100 ++ .../ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml | 98 ++ .../ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml | 98 ++ .../ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml | 102 ++ .../ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml | 99 ++ .../ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml | 97 ++ .../ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml | 99 ++ .../ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml | 95 ++ .../ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml | 96 ++ .../ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml | 94 + .../ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml | 92 + .../ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml | 92 + .../ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml | 93 + .../ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml | 127 ++ .../ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml | 101 ++ .../ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml | 103 ++ .../ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml | 99 ++ .../ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml | 99 ++ .../ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml | 100 ++ .../ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml | 102 ++ .../ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml | 104 ++ .../ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml | 94 + 
.../ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml | 104 ++ .../ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml | 104 ++ .../ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml | 97 ++ ...28_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml | 107 ++ ...16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml | 137 ++ .../c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml | 117 ++ ...56_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml | 107 ++ ...2_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml | 107 ++ ...4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml | 135 ++ ...12_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml | 153 ++ ...64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml | 137 ++ ...8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml | 107 ++ ...28_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml | 182 ++ ...16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml | 147 ++ .../c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml | 113 ++ ...56_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml | 101 ++ ...32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml | 147 ++ ...c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml | 147 ++ ...12_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml | 182 ++ ...64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml | 147 ++ ...c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml | 147 ++ ...128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml | 117 ++ ...c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml | 117 ++ .../c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml | 117 ++ ...256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml | 117 ++ ...c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml | 117 ++ .../c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml | 117 ++ ...512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml | 117 ++ ...c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml | 117 ++ .../c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml | 117 ++ ...28_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml | 114 ++ ...c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml | 111 ++ .../c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml | 111 ++ ...56_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml | 111 ++ ...32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml | 111 ++ .../c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml | 111 ++ ...12_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml | 111 ++ ...64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml | 111 ++ .../c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml | 111 ++ .../1k1k/disagg-gb200-1p1d-dep4-dep16.yaml | 101 ++ .../1k1k/disagg-gb200-1p4d-dep4-tep4.yaml | 98 ++ .../8k1k/disagg-gb200-1p4d-dep4-tep4.yaml | 98 ++ .../8k1k/disagg-gb200-3p1d-dep4-dep16.yaml | 101 ++ .../8k1k/disagg-gb200-5p1d-dep4-dep8.yaml | 101 ++ .../8k1k/disagg-gb200-6p1d-dep4-dep16.yaml | 101 ++ runners/launch_b200-dgxc.sh | 2 +- runners/launch_b300-nv.sh | 2 +- runners/launch_gb200-nv.sh | 26 +- runners/launch_gb300-nv.sh | 2 +- runners/launch_h100-dgxc-slurm.sh | 2 +- runners/launch_h200-dgxc-slurm.sh | 2 +- utils/matrix_logic/generate_sweep_configs.py | 6 + utils/matrix_logic/validation.py | 33 + 377 files changed, 47030 insertions(+), 1142 deletions(-) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/b200-fp4/1k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/b200-fp4/8k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/b200-fp8/1k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_lowlat_0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_lowlat_1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_lowlat_2.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_1.yaml create mode 100644 
benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_2.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_lowlat_0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_lowlat_1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_lowlat_2.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_2.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/1k1k/low-latency.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/1k1k/max-tpt.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/1k1k/mid-curve.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/8k1k/low-latency.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/8k1k/max-tpt.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/8k1k/mid-curve.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/low-latency.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/max-tpt.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/mid-curve.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/ultra-tpt.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/8k1k/low-latency.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/8k1k/max_tpt.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/8k1k/mid-curve.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/1k1k/low_latency.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/1k1k/max_tpt.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/1k1k/mid_curve.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/8k1k/low_latency.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/8k1k/max_tpt.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/8k1k/mid_curve.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/1k1k/stp/low-latency.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/1k1k/stp/max.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/1k1k/stp/mid.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/8k1k/stp/low-latency.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/8k1k/stp/max.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/8k1k/stp/mid.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/mtp/h100-fp8-1p2d-max-tp-mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/stp/h100-fp8-1p1d-max-dep.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/stp/h100-fp8-1p2d-max-tp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml create mode 
100644 benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/mtp/h100-fp8-1p1d-max-tp-mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/stp/h100-fp8-1p1d-max-dep.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/stp/h100-fp8-1p1d-max-tp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-dep-mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-dep.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-tp-mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-tp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/low-latency-1p9d-mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/low-latency-1p9d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs128-1p1d-dep-mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs128-1p1d-dep.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs16-1p3d-mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs16-1p3d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs4-1p7d-mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs4-1p7d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs64-2p3d-mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs64-2p3d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs8-1p6d-mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs8-1p6d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml create mode 100644 
benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml create mode 
100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml create mode 100644 
benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml create mode 100644 
benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml create mode 100644 
benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml create mode 100644 
benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml create mode 
100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml create mode 100644 
benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml

diff --git a/.github/configs/CONFIGS.md b/.github/configs/CONFIGS.md
index 9d3c24309..f383f20ba 100644
--- a/.github/configs/CONFIGS.md
+++ b/.github/configs/CONFIGS.md
@@ -47,6 +47,32 @@
 Notes:
 - No extra fields besides the ones listed may be specified, or else the benchmarks will fail to run.
 - Setting the fields above, particularly `ep` and `dp-attn`, only guarantees that the respective values will be passed as environment variables to the benchmark scripts! Actually using those environment variables is an implementation detail at the level of the benchmark Bash script.
+## Multi-node srt-slurm recipes
+
+Multi-node configs that dispatch via `srt-slurm` (i.e. `srtctl apply -f …`) reference their recipe as a first-class field on the search-space entry:
+
+```yaml
+search-space:
+- spec-decoding: "mtp"
+  conc-list: [1214]
+  recipe: "trtllm/b200-fp4/1k1k/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml"
+  prefill:
+    num-worker: 1
+    tp: 4
+    ep: 4
+    dp-attn: true
+  decode:
+    num-worker: 2
+    tp: 8
+    ep: 8
+    dp-attn: true
+```
+
+- `recipe` is a path **relative to `benchmarks/multi_node/srt-slurm-recipes/`** in this repo. The schema validator rejects entries whose recipe file does not exist on disk, so adding a new entry requires upstreaming the recipe YAML here first (a sketch of this check follows the diff below).
+- The path may carry an `:override[N]` / `:override_` suffix to select a named override section inside an sglang-style recipe YAML (e.g. `"b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[0]"`). The launcher strips this suffix before reading the file but passes the full string to `srtctl`.
+- `recipe` is optional: multi-node entries that do *not* go through srt-slurm (e.g. dynamo-sglang aggregated topologies that drive their own bash) leave it unset.
+- Recipes live under `benchmarks/multi_node/srt-slurm-recipes/`, mirroring the upstream NVIDIA/srt-slurm `recipes/` layout (e.g. `trtllm/b200-fp4/...`, `vllm/deepseek-v4/...`, `gb200-fp4/...`). The benchmark template resolves `recipe` to an absolute path and passes it to the launcher as `CONFIG_FILE`, so launchers do not see the relative form.
+
 ## Runners
 
 The `runners.yaml` config represents the available runners in the repository. The keys are the runner *types* (i.e., the GPUs as well as some specific combinations like `b200-trt`) whereas each value is a list of *runner nodes*. This config is used to verify the master configs.
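The fail-fast behavior the CONFIGS.md bullets above describe is small enough to sketch. The following Python is illustrative only: the names `RECIPE_ROOT`, `split_override`, and `resolve_recipe` are assumptions for this sketch, not the repository's actual sweep-generation API, and the real validator lives in the repo's tooling.

```python
# Illustrative sketch of the fail-fast recipe check; names are hypothetical.
from pathlib import Path

# Assumed location of the vendored recipes, per CONFIGS.md above.
RECIPE_ROOT = Path("benchmarks/multi_node/srt-slurm-recipes")


def split_override(recipe: str) -> tuple[str, str | None]:
    """Split 'b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[0]' into the
    on-disk path and the optional override selector after the colon."""
    path, sep, override = recipe.partition(":")
    return path, (override if sep else None)


def resolve_recipe(recipe: str, repo_root: Path) -> Path:
    """Reject unknown recipes at sweep-generation time, then return the
    absolute path the benchmark template would export as CONFIG_FILE."""
    rel_path, _override = split_override(recipe)
    candidate = repo_root / RECIPE_ROOT / rel_path
    if not candidate.is_file():
        raise ValueError(f"unknown recipe {recipe!r}: no file at {candidate}")
    return candidate.resolve()
```

Note that, per the documentation, only the existence check strips the `:override` suffix; the launcher still receives the full recipe string for `srtctl`, so an entry like `recipe: "trtllm/b200-fp4/1k1k/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml"` resolves against the scratch copy before being handed over as `CONFIG_FILE`.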
diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 9e4177ee8..4a03b1c0f 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -13,14 +13,12 @@ dsr1-fp4-b200-dynamo-trt:
   search-space:
   - spec-decoding: "mtp"
     conc-list: [1214]
+    recipe: "trtllm/b200-fp4/1k1k/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml"
     decode:
       num-worker: 2
       tp: 8
@@ -28,14 +26,12 @@ dsr1-fp4-b200-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [875]
+    recipe: "trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml"
     decode:
       num-worker: 5
       tp: 8
@@ -43,14 +39,12 @@ dsr1-fp4-b200-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [6]
+    recipe: "trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml"
     decode:
       num-worker: 5
       tp: 8
@@ -58,14 +52,12 @@ dsr1-fp4-b200-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [10, 15, 25, 45, 90, 180]
+    recipe: "trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml"
     decode:
       num-worker: 5
       tp: 8
@@ -73,14 +65,12 @@ dsr1-fp4-b200-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [ 4968 ]
+    recipe: "trtllm/b200-fp4/1k1k/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml"
     prefill:
       num-worker: 3
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml"
     decode:
       num-worker: 4
       tp: 8
@@ -88,14 +78,12 @@ dsr1-fp4-b200-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [10860]
+    recipe: "trtllm/b200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml"
     prefill:
       num-worker: 3
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml"
     decode:
       num-worker: 5
       tp: 4
@@ -104,84 +92,72 @@ dsr1-fp4-b200-dynamo-trt:
       dp-attn: true
   # Non-MTP configurations
   - conc-list: [4096]
+    recipe: "trtllm/b200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml"
     decode:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
   - conc-list: [2192]
+    recipe: "trtllm/b200-fp4/1k1k/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml"
     decode:
       num-worker: 2
       tp: 8
       ep: 8
       dp-attn: true
   - conc-list: [1365]
+    recipe: "trtllm/b200-fp4/1k1k/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml"
     decode:
       num-worker: 5
       tp: 8
       ep: 8
       dp-attn: true
   - conc-list: [6]
+    recipe: "trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml"
     decode:
       num-worker: 5
       tp: 8
       ep: 8
       dp-attn: false
   - conc-list: [10, 15, 25, 45, 90, 180]
+    recipe: "trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml"
     decode:
       num-worker: 5
       tp: 8
       ep: 8
       dp-attn: false
   - conc-list: [450]
+    recipe: "trtllm/b200-fp4/1k1k/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml"
     decode:
       num-worker: 6
       tp: 8
@@ -193,14 +169,12 @@ dsr1-fp4-b200-dynamo-trt:
   search-space:
   - spec-decoding: "mtp"
     conc-list: [90]
+    recipe: "trtllm/b200-fp4/8k1k/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml"
     decode:
       num-worker: 1
       tp: 8
@@ -208,14 +182,12 @@ dsr1-fp4-b200-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [66]
+    recipe: "trtllm/b200-fp4/8k1k/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml"
     decode:
       num-worker: 3
       tp: 8
@@ -223,14 +195,12 @@ dsr1-fp4-b200-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [6]
+    recipe: "trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml"
     decode:
       num-worker: 5
       tp: 8
@@ -238,14 +208,12 @@ dsr1-fp4-b200-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [10, 15, 30, 60]
+    recipe: "trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml"
     decode:
       num-worker: 5
       tp: 8
@@ -253,14 +221,12 @@ dsr1-fp4-b200-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [548]
+    recipe: "trtllm/b200-fp4/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml"
     prefill:
       num-worker: 3
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml"
     decode:
       num-worker: 1
       tp: 8
@@ -268,14 +234,12 @@ dsr1-fp4-b200-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [1096, 1691]
+    recipe: "trtllm/b200-fp4/8k1k/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml"
     prefill:
       num-worker: 5
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml"
     decode:
       num-worker: 1
       tp: 8
@@ -283,14 +247,12 @@ dsr1-fp4-b200-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [658]
+    recipe: "trtllm/b200-fp4/8k1k/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml"
     prefill:
       num-worker: 5
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml"
     decode:
       num-worker: 2
       tp: 8
@@ -299,84 +261,72 @@ dsr1-fp4-b200-dynamo-trt:
       dp-attn: true
   # Non-MTP configurations
   - conc-list: [6]
+    recipe: "trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml"
     decode:
       num-worker: 5
       tp: 8
       ep: 8
       dp-attn: false
   - conc-list: [10, 15, 25, 50, 100]
+    recipe: "trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml"
     decode:
       num-worker: 5
       tp: 8
       ep: 8
       dp-attn: false
   - conc-list: [370]
+    recipe: "trtllm/b200-fp4/8k1k/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml"
     prefill:
       num-worker: 2
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml"
     decode:
       num-worker: 5
       tp: 8
       ep: 8
       dp-attn: false
   - conc-list: [1606]
+    recipe: "trtllm/b200-fp4/8k1k/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml"
     prefill:
       num-worker: 4
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml"
     decode:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
   - conc-list: [837]
+    recipe: "trtllm/b200-fp4/8k1k/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml"
     prefill:
       num-worker: 4
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml"
     decode:
       num-worker: 3
       tp: 8
       ep: 8
       dp-attn: true
   - conc-list: [2222]
+    recipe: "trtllm/b200-fp4/8k1k/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml"
     prefill:
       num-worker: 7
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml"
     decode:
       num-worker: 2
       tp: 8
@@ -399,14 +349,12 @@ dsr1-fp8-b200-dynamo-trt:
   # MTP configurations - Low latency (TP attention)
   - spec-decoding: "mtp"
     conc-list: [8]
+    recipe: "trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml"
     decode:
       num-worker: 8
       tp: 8
@@ -414,14 +362,12 @@ dsr1-fp8-b200-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [32]
+    recipe: "trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml"
     decode:
       num-worker: 8
       tp: 8
@@ -429,14 +375,12 @@ dsr1-fp8-b200-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [64]
+    recipe: "trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml"
     decode:
       num-worker: 8
       tp: 8
@@ -444,14 +388,12 @@ dsr1-fp8-b200-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [256]
+    recipe: "trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml"
     decode:
       num-worker: 8
       tp: 8
@@ -460,14 +402,12 @@ dsr1-fp8-b200-dynamo-trt:
   # MTP configurations - High throughput (DP attention)
   - spec-decoding: "mtp"
     conc-list: [896]
+    recipe: "trtllm/b200-fp8/1k1k/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml"
     decode:
       num-worker: 7
       tp: 8
@@ -475,14 +415,12 @@ dsr1-fp8-b200-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [1024]
+    recipe: "trtllm/b200-fp8/1k1k/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml"
     decode:
       num-worker: 4
       tp: 8
@@ -490,14 +428,12 @@ dsr1-fp8-b200-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [1184]
+    recipe: "trtllm/b200-fp8/1k1k/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml"
     decode:
       num-worker: 3
       tp: 8
@@ -505,14 +441,12 @@ dsr1-fp8-b200-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [1600]
+    recipe: "trtllm/b200-fp8/1k1k/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml"
     decode:
       num-worker: 2
       tp: 8
@@ -521,42 +455,36 @@ dsr1-fp8-b200-dynamo-trt:
       dp-attn: true
   # Non-MTP (STP) configurations - Low latency (TP attention)
   - conc-list: [4]
+    recipe: "trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml"
     decode:
       num-worker: 3
       tp: 8
       ep: 1
       dp-attn: false
   - conc-list: [32]
+    recipe: "trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml"
     decode:
       num-worker: 3
       tp: 8
       ep: 1
       dp-attn: false
   - conc-list: [128]
+    recipe: "trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml"
     decode:
       num-worker: 3
       tp: 8
@@ -564,42 +492,36 @@ dsr1-fp8-b200-dynamo-trt:
       dp-attn: false
   # Non-MTP (STP) configurations - High throughput (DP attention)
   - conc-list: [1920]
+    recipe: "trtllm/b200-fp8/1k1k/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml"
     decode:
       num-worker: 5
       tp: 8
       ep: 8
       dp-attn: true
   - conc-list: [4096]
+    recipe: "trtllm/b200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml"
     decode:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
   - conc-list: [5152]
+    recipe: "trtllm/b200-fp8/1k1k/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml"
     prefill:
       num-worker: 2
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml"
     decode:
       num-worker: 5
       tp: 8
@@ -612,14 +534,12 @@ dsr1-fp8-b200-dynamo-trt:
   # MTP configurations - Low latency (TP attention)
   - spec-decoding: "mtp"
     conc-list: [8]
+    recipe: "trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml"
     decode:
       num-worker: 6
       tp: 8
@@ -627,14 +547,12 @@ dsr1-fp8-b200-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [8]
+    recipe: "trtllm/b200-fp8/8k1k/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml"
     decode:
       num-worker: 2
       tp: 8
@@ -642,14 +560,12 @@ dsr1-fp8-b200-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [48]
+    recipe: "trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml"
     decode:
       num-worker: 6
       tp: 8
@@ -657,14 +573,12 @@ dsr1-fp8-b200-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [64]
+    recipe: "trtllm/b200-fp8/8k1k/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml"
     decode:
       num-worker: 4
       tp: 8
@@ -673,14 +587,12 @@ dsr1-fp8-b200-dynamo-trt:
   # MTP configurations - High throughput (DP attention)
   - spec-decoding: "mtp"
     conc-list: [224]
+    recipe: "trtllm/b200-fp8/8k1k/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml"
     prefill:
       num-worker: 2
       tp: 8
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml"
     decode:
       num-worker: 3
       tp: 8
@@ -688,14 +600,12 @@ dsr1-fp8-b200-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [288]
+    recipe: "trtllm/b200-fp8/8k1k/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml"
     prefill:
       num-worker: 2
       tp: 8
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml"
     decode:
       num-worker: 1
       tp: 8
@@ -703,14 +613,12 @@ dsr1-fp8-b200-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [1088]
+    recipe: "trtllm/b200-fp8/8k1k/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml"
     prefill:
       num-worker: 4
       tp: 8
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml"
     decode:
       num-worker: 1
       tp: 8
@@ -719,56 +627,48 @@ dsr1-fp8-b200-dynamo-trt:
       dp-attn: true
   # Non-MTP (STP) configurations - Low latency (TP attention)
   - conc-list: [1]
+    recipe: "trtllm/b200-fp8/8k1k/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 1
       dp-attn: false
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml"
     decode:
       num-worker: 1
       tp: 8
       ep: 1
       dp-attn: false
   - conc-list: [32]
+    recipe: "trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml"
     decode:
       num-worker: 4
       tp: 8
       ep: 1
       dp-attn: false
   - conc-list: [128]
+    recipe: "trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml"
     decode:
       num-worker: 4
       tp: 8
       ep: 1
       dp-attn: false
   - conc-list: [96]
+    recipe: "trtllm/b200-fp8/8k1k/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml"
     decode:
       num-worker: 6
       tp: 8
@@ -776,56 +676,48 @@ dsr1-fp8-b200-dynamo-trt:
       dp-attn: false
   # Non-MTP (STP) configurations - High throughput (DP attention)
   - conc-list: [128]
+    recipe: "trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml"
     decode:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
   - conc-list: [128]
+    recipe: "trtllm/b200-fp8/8k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml"
     decode:
       num-worker: 2
       tp: 8
       ep: 8
       dp-attn: true
   - conc-list: [256]
+    recipe: "trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml"
     decode:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
   - conc-list: [640]
+    recipe: "trtllm/b200-fp8/8k1k/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml"
     prefill:
       num-worker: 2
       tp: 8
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml"
     decode:
       num-worker: 1
       tp: 8
@@ -848,14 +740,12 @@ dsr1-fp4-b300-dynamo-trt:
   search-space:
   - spec-decoding: "mtp"
     conc-list: [654]
+    recipe: "trtllm/b300-fp4/1k1k/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml"
     decode:
       num-worker: 1
       tp: 8
@@ -863,14 +753,12 @@ dsr1-fp4-b300-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [271]
+    recipe: "trtllm/b300-fp4/1k1k/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml"
     decode:
       num-worker: 2
       tp: 8
@@ -878,14 +766,12 @@ dsr1-fp4-b300-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [11]
+    recipe: "trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml"
     decode:
       num-worker: 5
       tp: 8
@@ -893,14 +779,12 @@ dsr1-fp4-b300-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [10, 20, 25, 60, 120, 200]
+    recipe: "trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml"
     decode:
       num-worker: 5
       tp: 8
@@ -908,14 +792,12 @@ dsr1-fp4-b300-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [2342]
+    recipe: "trtllm/b300-fp4/1k1k/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml"
     prefill:
       num-worker: 2
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml"
     decode:
       num-worker: 1
       tp: 8
@@ -923,14 +805,12 @@ dsr1-fp4-b300-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [8609]
+    recipe: "trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml"
     prefill:
       num-worker: 5
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml"
     decode:
       num-worker: 2
       tp: 8
@@ -938,14 +818,12 @@ dsr1-fp4-b300-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [12926]
+    recipe: "trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml"
     prefill:
       num-worker: 5
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml"
     decode:
       num-worker: 2
       tp: 8
@@ -954,98 +832,84 @@ dsr1-fp4-b300-dynamo-trt:
       dp-attn: true
   # Non-MTP configurations
   - conc-list: [1176]
+    recipe: "trtllm/b300-fp4/1k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml"
     prefill:
       num-worker: 1
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml"
     decode:
       num-worker: 2
       tp: 8
       ep: 8
       dp-attn: true
   - conc-list: [6]
+    recipe: "trtllm/b300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml"
     prefill:
       num-worker: 1
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml"
     decode:
       num-worker: 4
       tp: 8
       ep: 8
       dp-attn: false
   - conc-list: [5, 10, 15, 25]
+    recipe: "trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml"
     prefill:
       num-worker: 1
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml"
     decode:
       num-worker: 5
       tp: 4
       ep: 4
       dp-attn: false
   - conc-list: [60, 110, 195, 395]
+    recipe: "trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml"
     prefill:
       num-worker: 1
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml"
     decode:
       num-worker: 5
       tp: 8
       ep: 8
       dp-attn: false
   - conc-list: [4405]
+    recipe: "trtllm/b300-fp4/1k1k/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml"
     prefill:
       num-worker: 2
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml"
     decode:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
   - conc-list: [8192]
+    recipe: "trtllm/b300-fp4/1k1k/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml"
     prefill:
       num-worker: 3
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml"
     decode:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
   - conc-list: [4611]
+    recipe: "trtllm/b300-fp4/1k1k/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml"
     prefill:
       num-worker: 3
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml"
     decode:
       num-worker: 2
       tp: 8
@@ -1057,14 +921,12 @@ dsr1-fp4-b300-dynamo-trt:
   search-space:
   - spec-decoding: "mtp"
     conc-list: [2198]
+    recipe: "trtllm/b300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml"
     prefill:
       num-worker: 10
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml"
     decode:
       num-worker: 1
       tp: 8
@@ -1072,14 +934,12 @@ dsr1-fp4-b300-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [52]
+    recipe: "trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml"
     decode:
       num-worker: 4
       tp: 4
@@ -1087,14 +947,12 @@ dsr1-fp4-b300-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [8]
+    recipe: "trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml"
     decode:
       num-worker: 4
       tp: 8
@@ -1102,14 +960,12 @@ dsr1-fp4-b300-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [32]
+    recipe: "trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml"
     decode:
       num-worker: 4
       tp: 8
@@ -1117,14 +973,12 @@ dsr1-fp4-b300-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [181]
+    recipe: "trtllm/b300-fp4/8k1k/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml"
     prefill:
       num-worker: 3
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml"
     decode:
       num-worker: 1
       tp: 8
@@ -1132,14 +986,12 @@ dsr1-fp4-b300-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [1197]
+    recipe: "trtllm/b300-fp4/8k1k/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml"
     prefill:
       num-worker: 9
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml"
     decode:
       num-worker: 1
       tp: 8
@@ -1148,98 +1000,84 @@ dsr1-fp4-b300-dynamo-trt:
       dp-attn: true
   # Non-MTP configurations
   - conc-list: [105]
+    recipe: "trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml"
     prefill:
       num-worker: 1
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml"
     decode:
       num-worker: 3
       tp: 4
       ep: 4
       dp-attn: false
   - conc-list: [63]
+    recipe: "trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml"
     prefill:
       num-worker: 1
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml"
     decode:
       num-worker: 3
       tp: 8
       ep: 8
       dp-attn: false
   - conc-list: [4]
+    recipe: "trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml"
     prefill:
       num-worker: 1
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml"
     decode:
       num-worker: 3
       tp: 8
       ep: 8
       dp-attn: false
   - conc-list: [12]
+    recipe: "trtllm/b300-fp4/8k1k/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml"
     prefill:
       num-worker: 1
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml"
     decode:
       num-worker: 4
       tp: 4
       ep: 4
       dp-attn: false
   - conc-list: [589]
+    recipe: "trtllm/b300-fp4/8k1k/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml"
     prefill:
       num-worker: 5
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml"
     decode:
       num-worker: 2
       tp: 8
       ep: 8
       dp-attn: true
   - conc-list: [1093]
+    recipe: "trtllm/b300-fp4/8k1k/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml"
     prefill:
       num-worker: 6
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml"
     decode:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
   - conc-list: [2048]
+    recipe: "trtllm/b300-fp4/8k1k/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml"
     prefill:
       num-worker: 8
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml"
     decode:
       num-worker: 1
       tp: 8
@@ -1262,14 +1100,12 @@ dsr1-fp8-b300-dynamo-trt:
   search-space:
   - spec-decoding: "mtp"
     conc-list: [10]
+    recipe: "trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml"
     decode:
       num-worker: 8
       tp: 8
@@ -1277,14 +1113,12 @@ dsr1-fp8-b300-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [160]
+    recipe: "trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml"
     decode:
       num-worker: 8
       tp: 8
@@ -1292,14 +1126,12 @@ dsr1-fp8-b300-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [3072]
+    recipe: "trtllm/b300-fp8/1k1k/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml"
     decode:
       num-worker: 1
       tp: 8
@@ -1307,14 +1139,12 @@ dsr1-fp8-b300-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [2560]
+    recipe: "trtllm/b300-fp8/1k1k/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml"
     decode:
       num-worker: 2
       tp: 8
@@ -1322,14 +1152,12 @@ dsr1-fp8-b300-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [720]
+    recipe: "trtllm/b300-fp8/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml"
     decode:
       num-worker: 5
       tp: 8
@@ -1337,14 +1165,12 @@ dsr1-fp8-b300-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [11264]
+    recipe: "trtllm/b300-fp8/1k1k/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml"
     prefill:
       num-worker: 3
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml"
     decode:
       num-worker: 2
       tp: 8
@@ -1355,98 +1181,84 @@ dsr1-fp8-b300-dynamo-trt:
   osl: 1024
   search-space:
   - conc-list: [2112]
+    recipe: "trtllm/b300-fp8/1k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml"
     decode:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
   - conc-list: [3072]
+    recipe: "trtllm/b300-fp8/1k1k/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml"
     decode:
       num-worker: 2
       tp: 8
       ep: 1
       dp-attn: true
   - conc-list: [1280]
+    recipe: "trtllm/b300-fp8/1k1k/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml"
     decode:
       num-worker: 3
       tp: 8
       ep: 1
       dp-attn: true
   - conc-list: [12]
+    recipe: "trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml"
     decode:
       num-worker: 8
       tp: 8
       ep: 1
       dp-attn: false
   - conc-list: [128]
+    recipe: "trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml"
     decode:
       num-worker: 8
       tp: 8
       ep: 1
       dp-attn: false
   - conc-list: [384]
+    recipe: "trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml"
     decode:
       num-worker: 8
       tp: 8
       ep: 1
       dp-attn: false
   - conc-list: [16384]
+    recipe: "trtllm/b300-fp8/1k1k/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml"
     prefill:
       num-worker: 2
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml"
     decode:
       num-worker: 1
       tp: 8
@@ -1458,14 +1270,12 @@ dsr1-fp8-b300-dynamo-trt:
   search-space:
   - spec-decoding: "mtp"
     conc-list: [40]
+    recipe: "trtllm/b300-fp8/8k1k/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml"
     decode:
       num-worker: 2
       tp: 8
@@ -1473,14 +1283,12 @@ dsr1-fp8-b300-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [8]
+    recipe: "trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml"
     decode:
       num-worker: 4
       tp: 8
@@ -1488,14 +1296,12 @@ dsr1-fp8-b300-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [20]
+    recipe: "trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml"
     decode:
       num-worker: 4
       tp: 8
@@ -1503,14 +1309,12 @@ dsr1-fp8-b300-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [72]
+    recipe: "trtllm/b300-fp8/8k1k/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml"
     decode:
       num-worker: 1
       tp: 8
@@ -1518,14 +1322,12 @@ dsr1-fp8-b300-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [144]
+    recipe: "trtllm/b300-fp8/8k1k/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml"
     prefill:
       num-worker: 2
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml"
     decode:
       num-worker: 1
       tp: 8
@@ -1533,14 +1335,12 @@ dsr1-fp8-b300-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [512]
+    recipe: "trtllm/b300-fp8/8k1k/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml"
     prefill:
       num-worker: 4
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml"
     decode:
       num-worker: 1
       tp: 8
@@ -1551,98 +1351,84 @@ dsr1-fp8-b300-dynamo-trt:
   osl: 1024
   search-space:
   - conc-list: [64]
+    recipe: "trtllm/b300-fp8/8k1k/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml"
     decode:
       num-worker: 4
       tp: 8
       ep: 1
       dp-attn: false
   - conc-list: [16]
+    recipe: "trtllm/b300-fp8/8k1k/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml"
     decode:
       num-worker: 8
       tp: 8
       ep: 1
       dp-attn: false
   - conc-list: [256]
+    recipe: "trtllm/b300-fp8/8k1k/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml"
     prefill:
       num-worker: 2
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml"
     decode:
       num-worker: 1
       tp: 8
       ep: 1
       dp-attn: true
   - conc-list: [512]
+    recipe: "trtllm/b300-fp8/8k1k/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml"
     prefill:
       num-worker: 3
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml"
     decode:
       num-worker: 1
       tp: 8
       ep: 1
       dp-attn: true
   - conc-list: [256]
+    recipe: "trtllm/b300-fp8/8k1k/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml"
     prefill:
       num-worker: 3
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml"
     decode:
       num-worker: 5
       tp: 8
       ep: 1
       dp-attn: false
   - conc-list: [1075]
+    recipe: "trtllm/b300-fp8/8k1k/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml"
     prefill:
       num-worker: 5
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml"
     decode:
       num-worker: 1
       tp: 8
       ep: 1
       dp-attn: true
   - conc-list: [3072]
+    recipe: "trtllm/b300-fp8/8k1k/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml"
     prefill:
       num-worker: 7
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml"
     decode:
       num-worker: 1
       tp: 8
@@ -2654,14 +2440,12 @@ dsr1-fp8-h200-dynamo-trt:
   # MTP configurations
   - spec-decoding: "mtp"
     conc-list: [1]
+    recipe: "trtllm/h200/1k1k/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml"
     decode:
       num-worker: 11
       tp: 8
@@ -2669,14 +2453,12 @@ dsr1-fp8-h200-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [4]
+    recipe: "trtllm/h200/1k1k/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml"
     decode:
       num-worker: 11
       tp: 8
@@ -2684,14 +2466,12 @@ dsr1-fp8-h200-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [8]
+    recipe: "trtllm/h200/1k1k/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml"
     decode:
       num-worker: 11
       tp: 8
@@ -2699,14 +2479,12 @@ dsr1-fp8-h200-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [16]
+    recipe: "trtllm/h200/1k1k/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml"
     decode:
       num-worker: 9
       tp: 8
@@ -2714,14 +2492,12 @@ dsr1-fp8-h200-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [32]
+    recipe: "trtllm/h200/1k1k/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml"
     decode:
       num-worker: 11
       tp: 8
@@ -2729,14 +2505,12 @@ dsr1-fp8-h200-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [64]
+    recipe: "trtllm/h200/1k1k/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml"
     decode:
       num-worker: 8
       tp: 8
@@ -2744,14 +2518,12 @@ dsr1-fp8-h200-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [128]
+    recipe: "trtllm/h200/1k1k/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml"
     decode:
       num-worker: 7
       tp: 8
@@ -2759,14 +2531,12 @@ dsr1-fp8-h200-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [256]
+    recipe: "trtllm/h200/1k1k/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml"
         decode:
           num-worker: 4
           tp: 8
@@ -2774,14 +2544,12 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [512]
+        recipe: "trtllm/h200/1k1k/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml"
         decode:
           num-worker: 2
           tp: 8
@@ -2789,126 +2557,108 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: true
       # Non-MTP configurations (STP)
       - conc-list: [1]
+        recipe: "trtllm/h200/1k1k/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml"
         decode:
           num-worker: 9
           tp: 8
           ep: 8
           dp-attn: false
       - conc-list: [4]
+        recipe: "trtllm/h200/1k1k/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml"
         decode:
           num-worker: 9
           tp: 8
           ep: 8
           dp-attn: false
       - conc-list: [8]
+        recipe: "trtllm/h200/1k1k/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml"
         decode:
           num-worker: 9
           tp: 8
           ep: 8
           dp-attn: false
       - conc-list: [16]
+        recipe: "trtllm/h200/1k1k/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml"
         decode:
           num-worker: 9
           tp: 8
           ep: 8
           dp-attn: false
       - conc-list: [32]
+        recipe: "trtllm/h200/1k1k/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml"
         decode:
           num-worker: 9
           tp: 8
           ep: 8
           dp-attn: false
       - conc-list: [64]
+        recipe: "trtllm/h200/1k1k/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml"
         decode:
           num-worker: 9
           tp: 8
           ep: 8
           dp-attn: false
       - conc-list: [128]
+        recipe: "trtllm/h200/1k1k/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml"
         decode:
           num-worker: 9
           tp: 8
           ep: 8
           dp-attn: true
       - conc-list: [256]
+        recipe: "trtllm/h200/1k1k/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml"
         decode:
           num-worker: 6
           tp: 8
           ep: 8
           dp-attn: true
       - conc-list: [512]
+        recipe: "trtllm/h200/1k1k/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml"
         prefill:
           num-worker: 2
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml"
         decode:
           num-worker: 7
           tp: 8
@@ -2920,14 +2670,12 @@ dsr1-fp8-h200-dynamo-trt:
       # MTP configurations
       - spec-decoding: "mtp"
         conc-list: [1]
+        recipe: "trtllm/h200/8k1k/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: false
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml"
         decode:
           num-worker: 7
           tp: 8
@@ -2935,14 +2683,12 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [4]
+        recipe: "trtllm/h200/8k1k/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: false
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml"
         decode:
           num-worker: 7
           tp: 8
@@ -2950,14 +2696,12 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [8]
+        recipe: "trtllm/h200/8k1k/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: false
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml"
         decode:
           num-worker: 6
           tp: 8
@@ -2965,14 +2709,12 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [16]
+        recipe: "trtllm/h200/8k1k/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: false
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml"
         decode:
           num-worker: 3
           tp: 8
@@ -2980,14 +2722,12 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [32]
+        recipe: "trtllm/h200/8k1k/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml"
         prefill:
           num-worker: 3
           tp: 8
           ep: 8
           dp-attn: false
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml"
         decode:
           num-worker: 5
           tp: 8
@@ -2995,14 +2735,12 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [64]
+        recipe: "trtllm/h200/8k1k/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: false
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml"
         decode:
           num-worker: 1
           tp: 8
@@ -3010,14 +2748,12 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [128]
+        recipe: "trtllm/h200/8k1k/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml"
         prefill:
           num-worker: 2
           tp: 8
           ep: 8
           dp-attn: false
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml"
         decode:
           num-worker: 1
           tp: 8
@@ -3025,14 +2761,12 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [256]
+        recipe: "trtllm/h200/8k1k/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml"
         prefill:
           num-worker: 3
           tp: 8
           ep: 8
           dp-attn: false
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml"
         decode:
           num-worker: 1
           tp: 8
@@ -3040,14 +2774,12 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [512]
+        recipe: "trtllm/h200/8k1k/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml"
         prefill:
           num-worker: 3
           tp: 8
           ep: 8
           dp-attn: false
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml"
         decode:
           num-worker: 1
           tp: 8
@@ -3055,126 +2787,108 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: true
       # Non-MTP configurations (STP)
       - conc-list: [1]
+        recipe: "trtllm/h200/8k1k/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: false
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml"
         decode:
           num-worker: 7
           tp: 8
           ep: 8
           dp-attn: false
       - conc-list: [4]
+        recipe: "trtllm/h200/8k1k/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: false
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml"
         decode:
           num-worker: 7
           tp: 8
           ep: 8
           dp-attn: false
       - conc-list: [8]
+        recipe: "trtllm/h200/8k1k/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: false
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml"
         decode:
           num-worker: 6
           tp: 8
           ep: 8
           dp-attn: false
       - conc-list: [16]
+        recipe: "trtllm/h200/8k1k/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: false
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml"
         decode:
           num-worker: 3
           tp: 8
           ep: 8
           dp-attn: false
       - conc-list: [32]
+        recipe: "trtllm/h200/8k1k/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml"
         prefill:
           num-worker: 2
           tp: 8
           ep: 8
           dp-attn: false
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml"
         decode:
           num-worker: 5
           tp: 8
           ep: 8
           dp-attn: false
       - conc-list: [64]
+        recipe: "trtllm/h200/8k1k/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml"
         prefill:
           num-worker: 2
           tp: 8
           ep: 8
           dp-attn: false
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml"
         decode:
           num-worker: 3
           tp: 8
           ep: 8
           dp-attn: true
       - conc-list: [128]
+        recipe: "trtllm/h200/8k1k/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: false
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml"
         decode:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
       - conc-list: [256]
+        recipe: "trtllm/h200/8k1k/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml"
         prefill:
           num-worker: 5
           tp: 8
           ep: 8
           dp-attn: false
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml"
         decode:
           num-worker: 3
           tp: 8
           ep: 8
           dp-attn: true
       - conc-list: [512]
+        recipe: "trtllm/h200/8k1k/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml"
         prefill:
           num-worker: 3
           tp: 8
           ep: 8
           dp-attn: false
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml"
         decode:
           num-worker: 1
           tp: 8
@@ -3197,14 +2911,12 @@ dsr1-fp8-h100-dynamo-trt:
       # MTP configurations
       - spec-decoding: "mtp"
         conc-list: [6]
+        recipe: "trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml"
         decode:
           num-worker: 3
           tp: 16
@@ -3212,14 +2924,12 @@ dsr1-fp8-h100-dynamo-trt:
         dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [9]
+        recipe: "trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml"
         decode:
           num-worker: 3
           tp: 16
@@ -3227,14 +2937,12 @@ dsr1-fp8-h100-dynamo-trt:
         dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [30]
+        recipe: "trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml"
         decode:
           num-worker: 3
           tp: 16
@@ -3242,14 +2950,12 @@ dsr1-fp8-h100-dynamo-trt:
         dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [60]
+        recipe: "trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml"
         decode:
           num-worker: 3
           tp: 16
@@ -3257,14 +2963,12 @@ dsr1-fp8-h100-dynamo-trt:
         dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [117]
+        recipe: "trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml"
         decode:
           num-worker: 3
           tp: 16
@@ -3272,14 +2976,12 @@ dsr1-fp8-h100-dynamo-trt:
         dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [231]
+        recipe: "trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml"
         decode:
           num-worker: 3
           tp: 16
@@ -3287,14 +2989,12 @@ dsr1-fp8-h100-dynamo-trt:
         dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [462]
+        recipe: "trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml"
         decode:
           num-worker: 3
           tp: 16
@@ -3302,14 +3002,12 @@ dsr1-fp8-h100-dynamo-trt:
         dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [615]
+        recipe: "trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml"
         decode:
           num-worker: 1
           tp: 16
@@ -3317,14 +3015,12 @@ dsr1-fp8-h100-dynamo-trt:
         dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [1229]
+        recipe: "trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml"
         decode:
           num-worker: 1
           tp: 16
@@ -3332,126 +3028,108 @@ dsr1-fp8-h100-dynamo-trt:
         dp-attn: true
       # Non-MTP configurations (STP)
       - conc-list: [6]
+        recipe: "trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml"
         decode:
           num-worker: 3
           tp: 16
           ep: 16
           dp-attn: false
       - conc-list: [9]
+        recipe: "trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml"
         decode:
           num-worker: 3
           tp: 16
           ep: 16
           dp-attn: false
       - conc-list: [30]
+        recipe: "trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml"
         decode:
           num-worker: 3
           tp: 16
           ep: 16
           dp-attn: false
       - conc-list: [60]
+        recipe: "trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml"
         decode:
           num-worker: 3
           tp: 16
           ep: 16
           dp-attn: false
       - conc-list: [231]
+        recipe: "trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml"
         decode:
           num-worker: 3
           tp: 16
           ep: 16
           dp-attn: true
       - conc-list: [462]
+        recipe: "trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml"
         decode:
           num-worker: 3
           tp: 16
           ep: 16
           dp-attn: true
       - conc-list: [924]
+        recipe: "trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml"
         decode:
           num-worker: 3
           tp: 16
           ep: 16
           dp-attn: true
       - conc-list: [1845]
+        recipe: "trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml"
         decode:
           num-worker: 3
           tp: 16
           ep: 16
           dp-attn: true
       - conc-list: [4916]
+        recipe: "trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml"
         prefill:
           num-worker: 2
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml"
         decode:
           num-worker: 1
           tp: 16
@@ -3463,14 +3141,12 @@ dsr1-fp8-h100-dynamo-trt:
       # MTP configurations (6 points)
       - spec-decoding: "mtp"
         conc-list: [6]
+        recipe: "trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml"
         decode:
           num-worker: 3
           tp: 16
@@ -3478,14 +3154,12 @@ dsr1-fp8-h100-dynamo-trt:
         dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [9]
+        recipe: "trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml"
         decode:
           num-worker: 3
           tp: 16
@@ -3493,14 +3167,12 @@ dsr1-fp8-h100-dynamo-trt:
         dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [30]
+        recipe: "trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml"
         decode:
           num-worker: 3
           tp: 16
@@ -3508,14 +3180,12 @@ dsr1-fp8-h100-dynamo-trt:
         dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [77]
+        recipe: "trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml"
         decode:
           num-worker: 1
           tp: 16
@@ -3525,14 +3195,12 @@ dsr1-fp8-h100-dynamo-trt:
       # https://github.com/InferenceMAX/InferenceMAX/actions/runs/21769314582/job/62813105509
       # - spec-decoding: "mtp"
      #   conc-list: [78]
+      #   recipe: "trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml"
       #   prefill:
       #     num-worker: 1
       #     tp: 16
       #     ep: 16
       #     dp-attn: true
-      #     additional-settings:
-      #       # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml
-      #       - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml"
       #   decode:
       #     num-worker: 2
       #     tp: 16
@@ -3540,14 +3208,12 @@ dsr1-fp8-h100-dynamo-trt:
       #     dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [154]
+        recipe: "trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml"
         prefill:
           num-worker: 2
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml"
         decode:
           num-worker: 1
           tp: 16
@@ -3555,70 +3221,60 @@ dsr1-fp8-h100-dynamo-trt:
         dp-attn: true
       # STP configurations (5 points)
       - conc-list: [6]
+        recipe: "trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml"
         decode:
           num-worker: 3
           tp: 16
           ep: 16
           dp-attn: false
       - conc-list: [9]
+        recipe: "trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml"
         decode:
           num-worker: 3
           tp: 16
           ep: 16
           dp-attn: false
       - conc-list: [30]
+        recipe: "trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml"
         decode:
           num-worker: 3
           tp: 16
           ep: 16
           dp-attn: false
       - conc-list: [154]
+        recipe: "trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml"
         decode:
           num-worker: 2
           tp: 16
           ep: 16
           dp-attn: false
       - conc-list: [308]
+        recipe: "trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml"
         prefill:
           num-worker: 2
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml"
         decode:
           num-worker: 1
           tp: 16
@@ -3838,13 +3494,12 @@ dsr1-fp8-h100-dynamo-sglang:
     search-space:
       # # STP: Max throughput TEP (1 prefill, 2 decode)
       # - conc-list: [1, 2, 4, 8, 16, 32, 64, 128]
+      #   recipe: "h100/1k1k/stp/h100-fp8-1p2d-max-tp.yaml"
       #   prefill:
       #     num-worker: 1
       #     tp: 16
       #     ep: 1
       #     dp-attn: false
-      #     additional-settings:
-      #       - "CONFIG_FILE=recipes/h100/1k1k/stp/h100-fp8-1p2d-max-tp.yaml"
       #   decode:
       #     num-worker: 2
       #     tp: 16
@@ -3852,13 +3507,12 @@ dsr1-fp8-h100-dynamo-sglang:
      #     dp-attn: false
       # # STP: Max throughput DEP (1 prefill, 1 decode, dp-attention)
       # - conc-list: [1, 2, 4, 8, 16, 32, 64]
+      #   recipe: "h100/1k1k/stp/h100-fp8-1p1d-max-dep.yaml"
       #   prefill:
       #     num-worker: 1
       #     tp: 16
       #     ep: 1
       #     dp-attn: false
-      #     additional-settings:
-      #       - "CONFIG_FILE=recipes/h100/1k1k/stp/h100-fp8-1p1d-max-dep.yaml"
       #   decode:
       #     num-worker: 1
       #     tp: 16
@@ -3867,13 +3521,12 @@ dsr1-fp8-h100-dynamo-sglang:
       # MTP: Max throughput TEP (1 prefill, 2 decode)
       - spec-decoding: "mtp"
         conc-list: [1, 2, 4, 8, 16, 32, 64, 128]
+        recipe: "h100/1k1k/mtp/h100-fp8-1p2d-max-tp-mtp.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 1
           dp-attn: false
-          additional-settings:
-            - "CONFIG_FILE=recipes/h100/1k1k/mtp/h100-fp8-1p2d-max-tp-mtp.yaml"
         decode:
           num-worker: 2
           tp: 16
@@ -3882,13 +3535,12 @@ dsr1-fp8-h100-dynamo-sglang:
       # MTP: Max throughput DEP (1 prefill, 1 decode, dp-attention)
       - spec-decoding: "mtp"
         conc-list: [1, 2, 4, 8, 16, 32, 64]
+        recipe: "h100/1k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 1
           dp-attn: false
-          additional-settings:
-            - "CONFIG_FILE=recipes/h100/1k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml"
         decode:
           num-worker: 1
           tp: 16
@@ -3899,13 +3551,12 @@ dsr1-fp8-h100-dynamo-sglang:
     search-space:
       # # STP: Max throughput TEP (1 prefill, 1 decode)
       # - conc-list: [1, 2, 4, 8, 16, 32, 64, 128]
+      #   recipe: "h100/8k1k/stp/h100-fp8-1p1d-max-tp.yaml"
       #   prefill:
       #     num-worker: 1
       #     tp: 16
       #     ep: 1
       #     dp-attn: false
-      #     additional-settings:
-      #       - "CONFIG_FILE=recipes/h100/8k1k/stp/h100-fp8-1p1d-max-tp.yaml"
       #   decode:
       #     num-worker: 1
       #     tp: 16
@@ -3913,13 +3564,12 @@ dsr1-fp8-h100-dynamo-sglang:
      #     dp-attn: false
       # # STP: Max throughput DEP (1 prefill, 1 decode, dp-attention)
       # - conc-list: [1, 2, 4, 8, 16, 32, 64]
+      #   recipe: "h100/8k1k/stp/h100-fp8-1p1d-max-dep.yaml"
       #   prefill:
       #     num-worker: 1
       #     tp: 16
       #     ep: 1
       #     dp-attn: false
-      #     additional-settings:
-      #       - "CONFIG_FILE=recipes/h100/8k1k/stp/h100-fp8-1p1d-max-dep.yaml"
       #   decode:
       #     num-worker: 1
       #     tp: 16
@@ -3928,13 +3578,12 @@ dsr1-fp8-h100-dynamo-sglang:
       # MTP: Max throughput TEP (1 prefill, 1 decode)
       - spec-decoding: "mtp"
         conc-list: [1, 2, 4, 8, 16, 32, 64, 128]
+        recipe: "h100/8k1k/mtp/h100-fp8-1p1d-max-tp-mtp.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 1
           dp-attn: false
-          additional-settings:
-            - "CONFIG_FILE=recipes/h100/8k1k/mtp/h100-fp8-1p1d-max-tp-mtp.yaml"
         decode:
           num-worker: 1
           tp: 16
@@ -3943,13 +3592,12 @@ dsr1-fp8-h100-dynamo-sglang:
       # MTP: Max throughput DEP (1 prefill, 1 decode, dp-attention)
       - spec-decoding: "mtp"
         conc-list: [1, 2, 4, 8, 16, 32, 64]
+        recipe: "h100/8k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 1
           dp-attn: false
-          additional-settings:
-            - "CONFIG_FILE=recipes/h100/8k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml"
         decode:
           num-worker: 1
           tp: 16
@@ -4039,14 +3687,12 @@ dsr1-fp4-gb200-dynamo-trt:
       # MTP configurations (spec_decoding="mtp")
       - spec-decoding: "mtp"
         conc-list: [ 180 ]
+        recipe: "trtllm/gb200-fp4/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml"
         prefill:
           num-worker: 1
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml"
         decode:
           num-worker: 1
           tp: 32
@@ -4054,14 +3700,12 @@ dsr1-fp4-gb200-dynamo-trt:
         dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [ 4, 8, 12, 24, 48 ]
+        recipe: "trtllm/gb200-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml"
         prefill:
           num-worker: 1
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml"
         decode:
           num-worker: 4
           tp: 8
@@ -4069,14 +3713,12 @@ dsr1-fp4-gb200-dynamo-trt:
         dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [ 4301 ]
+        recipe: "trtllm/gb200-fp4/1k1k/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml"
         prefill:
           num-worker: 2
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml"
         decode:
           num-worker: 1
           tp: 16
@@ -4084,14 +3726,12 @@ dsr1-fp4-gb200-dynamo-trt:
         dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [ 2253 ]
+        recipe: "trtllm/gb200-fp4/1k1k/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml"
         prefill:
           num-worker: 3
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml"
         decode:
           num-worker: 1
           tp: 32
@@ -4099,14 +3739,12 @@ dsr1-fp4-gb200-dynamo-trt:
         dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [ 16130 ]
+        recipe: "trtllm/gb200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml"
         prefill:
           num-worker: 3
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml"
         decode:
           num-worker: 5
           tp: 4
@@ -4116,98 +3754,84 @@ dsr1-fp4-gb200-dynamo-trt:
 
       # Non-MTP configurations (default spec_decoding="none")
       - conc-list: [ 4301 ]
+        recipe: "trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml"
         decode:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
       - conc-list: [ 666 ]
+        recipe: "trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml"
         decode:
           num-worker: 1
           tp: 32
           ep: 32
           dp-attn: true
       - conc-list: [ 6144 ]
+        recipe: "trtllm/gb200-fp4/1k1k/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml"
         decode:
           num-worker: 2
           tp: 4
           ep: 4
           dp-attn: true
       - conc-list: [ 12, 24, 48, 96, 192 ]
+        recipe: "trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml"
         decode:
           num-worker: 4
           tp: 8
           ep: 8
           dp-attn: false
       - conc-list: [ 5 ]
+        recipe: "trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml"
         decode:
           num-worker: 4
           tp: 8
           ep: 8
           dp-attn: false
       - conc-list: [ 4301 ]
+        recipe: "trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml"
         prefill:
           num-worker: 2
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml"
         decode:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
       - conc-list: [ 2253 ]
+        recipe: "trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml"
         prefill:
           num-worker: 2
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml"
         decode:
           num-worker: 1
           tp: 32
@@ -4220,14 +3844,12 @@ dsr1-fp4-gb200-dynamo-trt:
       # MTP configurations (spec_decoding="mtp")
       - spec-decoding: "mtp"
         conc-list: [ 4, 8, 12, 24, 48 ]
+        recipe: "trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml"
         prefill:
           num-worker: 1
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml"
         decode:
           num-worker: 4
           tp: 8
@@ -4235,14 +3857,12 @@ dsr1-fp4-gb200-dynamo-trt:
         dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [ 180 ]
+        recipe: "trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml"
         prefill:
           num-worker: 3
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml"
         decode:
           num-worker: 1
           tp: 32
@@ -4250,14 +3870,12 @@ dsr1-fp4-gb200-dynamo-trt:
         dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [ 1229 ]
+        recipe: "trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml"
         prefill:
           num-worker: 7
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml"
         decode:
           num-worker: 1
           tp: 16
@@ -4265,14 +3883,12 @@ dsr1-fp4-gb200-dynamo-trt:
         dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [ 666 ]
+        recipe: "trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml"
         prefill:
           num-worker: 8
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml"
         decode:
           num-worker: 1
           tp: 32
@@ -4280,14 +3896,12 @@ dsr1-fp4-gb200-dynamo-trt:
         dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [ 4301 ]
+        recipe: "trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml"
         prefill:
           num-worker: 11
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml"
         decode:
           num-worker: 1
           tp: 16
@@ -4296,84 +3910,72 @@ dsr1-fp4-gb200-dynamo-trt:
 
       # Non-MTP configurations (default spec_decoding="none")
       - conc-list: [ 12, 44, 76 ]
+        recipe: "trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml"
         decode:
           num-worker: 4
           tp: 8
           ep: 8
           dp-attn: false
       - conc-list: [ 5 ]
+        recipe: "trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml"
         decode:
           num-worker: 4
           tp: 8
           ep: 8
           dp-attn: false
       - conc-list: [ 333 ]
+        recipe: "trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml"
         prefill:
           num-worker: 2
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml"
         decode:
           num-worker: 1
           tp: 32
           ep: 32
           dp-attn: true
       - conc-list: [ 1229 ]
+        recipe: "trtllm/gb200-fp4/8k1k/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml"
         prefill:
           num-worker: 7
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml"
         decode:
           num-worker: 1
           tp: 32
           ep: 32
           dp-attn: true
       - conc-list: [ 2253 ]
+        recipe: "trtllm/gb200-fp4/8k1k/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml"
         prefill:
           num-worker: 8
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml"
         decode:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
       - conc-list: [ 4096 ]
+        recipe: "trtllm/gb200-fp4/8k1k/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml"
         prefill:
           num-worker: 10
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml"
         decode:
           num-worker: 1
           tp: 16
@@ -4397,14 +3999,12 @@ dsr1-fp8-gb200-dynamo-trt:
     search-space:
       - spec-decoding: "mtp"
         conc-list: [4301]
+        recipe: "trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml"
         decode:
           num-worker: 1
           tp: 8
@@ -4412,14 +4012,12 @@ dsr1-fp8-gb200-dynamo-trt:
         dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [2151]
+        recipe: "trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml"
         decode:
           num-worker: 1
           tp: 8
@@ -4427,14 +4025,12 @@ dsr1-fp8-gb200-dynamo-trt:
         dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [1229]
+        recipe: "trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml"
         decode:
           num-worker: 1
           tp: 16
@@ -4442,14 +4038,12 @@ dsr1-fp8-gb200-dynamo-trt:
         dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [615]
+        recipe: "trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml"
         decode:
           num-worker: 1
           tp: 32
@@ -4457,14 +4051,12 @@ dsr1-fp8-gb200-dynamo-trt:
         dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [36]
+        recipe: "trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml"
         decode:
           num-worker: 3
           tp: 8
@@ -4472,14 +4064,12 @@ dsr1-fp8-gb200-dynamo-trt:
         dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [18]
+        recipe: "trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml"
         decode:
           num-worker: 3
           tp: 8
@@ -4487,14 +4077,12 @@ dsr1-fp8-gb200-dynamo-trt:
         dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [9]
+        recipe: "trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml"
         decode:
           num-worker: 3
           tp: 8
@@ -4502,98 +4090,84 @@ dsr1-fp8-gb200-dynamo-trt:
         dp-attn: false
       # 1k1k STP configs
       - conc-list: [6144]
+        recipe: "trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml"
         decode:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
       - conc-list: [4301]
+        recipe: "trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml"
         decode:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
       - conc-list: [2151]
+        recipe: "trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml"
         decode:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
       - conc-list: [1127]
+        recipe: "trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml"
         decode:
           num-worker: 1
           tp: 32
           ep: 32
           dp-attn: true
       - conc-list: [256]
+        recipe: "trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml"
         decode:
           num-worker: 1
           tp: 32
           ep: 32
           dp-attn: true
       - conc-list: [27]
+        recipe: "trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml"
         decode:
           num-worker: 3
           tp: 8
           ep: 8
           dp-attn: false
       - conc-list: [3]
+        recipe: "trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml"
         decode:
           num-worker: 3
           tp: 8
@@ -4605,14 +4179,12 @@ dsr1-fp8-gb200-dynamo-trt: search-space: - spec-decoding: "mtp" conc-list: [666] + recipe: "trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml" prefill: num-worker: 3 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml" decode: num-worker: 1 tp: 8 @@ -4620,14 +4192,12 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [666] + recipe: "trtllm/gb200-fp8/8k1k/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml" prefill: num-worker: 5 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml" decode: num-worker: 1 tp: 16 @@ -4635,14 +4205,12 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [333] + recipe: "trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml" prefill: num-worker: 3 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml" decode: num-worker: 1 tp: 16 @@ -4650,14 +4218,12 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [333] + recipe: "trtllm/gb200-fp8/8k1k/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml" prefill: num-worker: 4 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml" decode: num-worker: 1 tp: 32 @@ -4665,14 +4231,12 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [90] + recipe: "trtllm/gb200-fp8/8k1k/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml" prefill: num-worker: 2 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml" decode: num-worker: 1 tp: 32 @@ -4680,14 +4244,12 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [15] + recipe: "trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml" prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml" decode: num-worker: 3 tp: 8 @@ -4695,14 +4257,12 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [6] + recipe: "trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml" prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml - - 
"CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml" decode: num-worker: 3 tp: 8 @@ -4710,98 +4270,84 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: false # 8k1k STP configs - conc-list: [1229] + recipe: "trtllm/gb200-fp8/8k1k/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml" prefill: num-worker: 5 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true - conc-list: [666] + recipe: "trtllm/gb200-fp8/8k1k/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml" prefill: num-worker: 4 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true - conc-list: [615] + recipe: "trtllm/gb200-fp8/8k1k/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml" prefill: num-worker: 3 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true - conc-list: [333] + recipe: "trtllm/gb200-fp8/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml" prefill: num-worker: 2 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true - conc-list: [63] + recipe: "trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml" prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml" decode: num-worker: 3 tp: 8 ep: 8 dp-attn: false - conc-list: [18] + recipe: "trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml" prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml" decode: num-worker: 3 tp: 8 ep: 8 dp-attn: false - conc-list: [6] + recipe: "trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml" prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml" decode: num-worker: 3 tp: 8 @@ -4824,14 +4370,12 @@ dsr1-fp8-gb200-dynamo-sglang: search-space: # "Low latency" (1 prefill worker at TP4 and 1 decode worker at TP4) - conc-list: [4, 8] + 
recipe: "gb200-fp8/1k1k/low-latency.yaml" prefill: num-worker: 1 tp: 4 ep: 1 dp-attn: false - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/1k1k/low-latency.yaml - - "CONFIG_FILE=recipes/gb200-fp8/1k1k/low-latency.yaml" decode: num-worker: 1 tp: 4 @@ -4840,14 +4384,12 @@ dsr1-fp8-gb200-dynamo-sglang: # "Mid curve" (3 prefill workers at DEP8 and 1 decode worker at DEP48) - conc-list: [1024, 2048, 4096] + recipe: "gb200-fp8/1k1k/mid-curve.yaml" prefill: num-worker: 3 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/1k1k/mid-curve.yaml - - "CONFIG_FILE=recipes/gb200-fp8/1k1k/mid-curve.yaml" decode: num-worker: 1 tp: 48 @@ -4856,14 +4398,12 @@ dsr1-fp8-gb200-dynamo-sglang: # "Max throughput" (2 prefill workers at DEP8 and 1 decode worker at DEP32) - conc-list: [1024, 2048, 4096, 6144] + recipe: "gb200-fp8/1k1k/max-tpt.yaml" prefill: num-worker: 2 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/1k1k/max-tpt.yaml - - "CONFIG_FILE=recipes/gb200-fp8/1k1k/max-tpt.yaml" decode: num-worker: 1 tp: 32 @@ -4872,14 +4412,12 @@ dsr1-fp8-gb200-dynamo-sglang: # "Ultra throughput" (1 prefill workers at DEP8 and 1 decode worker at DEP8) - conc-list: [4096] + recipe: "gb200-fp8/1k1k/ultra-tpt.yaml" prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/1k1k/ultra-tpt.yaml - - "CONFIG_FILE=recipes/gb200-fp8/1k1k/ultra-tpt.yaml" decode: num-worker: 1 tp: 8 @@ -4891,14 +4429,12 @@ dsr1-fp8-gb200-dynamo-sglang: search-space: # "Low latency" (1 prefill worker at TP8 and 1 decode worker at TP8) - conc-list: [4, 8, 16] + recipe: "gb200-fp8/8k1k/low-latency.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: false - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/8k1k/low-latency.yaml - - "CONFIG_FILE=recipes/gb200-fp8/8k1k/low-latency.yaml" decode: num-worker: 1 tp: 8 @@ -4907,14 +4443,12 @@ dsr1-fp8-gb200-dynamo-sglang: # "Mid curve" (5 prefill workers at DEP8 and 1 decode worker at DEP32) - conc-list: [512, 1024, 2048, 6144] + recipe: "gb200-fp8/8k1k/mid-curve.yaml" prefill: num-worker: 5 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/8k1k/mid-curve.yaml - - "CONFIG_FILE=recipes/gb200-fp8/8k1k/mid-curve.yaml" decode: num-worker: 1 tp: 32 @@ -4923,14 +4457,12 @@ dsr1-fp8-gb200-dynamo-sglang: # "Max throughput" (6 prefill workers at DEP8 and 1 decode worker at DEP24) - conc-list: [2048, 4096, 6144] + recipe: "gb200-fp8/8k1k/max_tpt.yaml" prefill: num-worker: 6 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/8k1k/max_tpt.yaml - - "CONFIG_FILE=recipes/gb200-fp8/8k1k/max_tpt.yaml" decode: num-worker: 1 tp: 24 @@ -4952,14 +4484,12 @@ dsr1-fp8-gb300-dynamo-sglang: search-space: # "Low latency" (1 prefill worker at TP4 and 4 decode workers at TP4) - conc-list: [4, 8, 16, 32] + recipe: "gb300-fp8/1k1k/stp/low-latency.yaml" prefill: num-worker: 1 tp: 4 ep: 1 dp-attn: false - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb300-fp8/1k1k/stp/low-latency.yaml - - "CONFIG_FILE=recipes/gb300-fp8/1k1k/stp/low-latency.yaml" decode: 
num-worker: 4 tp: 4 @@ -4968,14 +4498,12 @@ dsr1-fp8-gb300-dynamo-sglang: # "Mid curve" (2 prefill workers at DEP8 and 1 decode worker at DEP32) - conc-list: [1024, 2048, 4096, 6144] + recipe: "gb300-fp8/1k1k/stp/mid.yaml" prefill: num-worker: 2 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb300-fp8/1k1k/stp/mid.yaml - - "CONFIG_FILE=recipes/gb300-fp8/1k1k/stp/mid.yaml" decode: num-worker: 1 tp: 32 @@ -4984,14 +4512,12 @@ dsr1-fp8-gb300-dynamo-sglang: # "Max throughput" (1 prefill worker at DEP8 and 1 decode worker at DEP8) - conc-list: [4096, 7168, 7680] + recipe: "gb300-fp8/1k1k/stp/max.yaml" prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb300-fp8/1k1k/stp/max.yaml - - "CONFIG_FILE=recipes/gb300-fp8/1k1k/stp/max.yaml" decode: num-worker: 1 tp: 8 @@ -5003,14 +4529,12 @@ dsr1-fp8-gb300-dynamo-sglang: search-space: # "Low latency" (1 prefill worker at TP4 and 1 decode worker at TP4) - conc-list: [4, 8] + recipe: "gb300-fp8/8k1k/stp/low-latency.yaml" prefill: num-worker: 1 tp: 4 ep: 1 dp-attn: false - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb300-fp8/8k1k/stp/low-latency.yaml - - "CONFIG_FILE=recipes/gb300-fp8/8k1k/stp/low-latency.yaml" decode: num-worker: 1 tp: 4 @@ -5019,14 +4543,12 @@ dsr1-fp8-gb300-dynamo-sglang: # "Mid curve" (5 prefill workers at DEP8 and 1 decode worker at DEP32) - conc-list: [128, 256, 512, 1024] + recipe: "gb300-fp8/8k1k/stp/mid.yaml" prefill: num-worker: 5 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb300-fp8/8k1k/stp/mid.yaml - - "CONFIG_FILE=recipes/gb300-fp8/8k1k/stp/mid.yaml" decode: num-worker: 1 tp: 32 @@ -5035,14 +4557,12 @@ dsr1-fp8-gb300-dynamo-sglang: # "Max throughput" (6 prefill workers at DEP8 and 1 decode worker at DEP24) - conc-list: [2048, 4096] + recipe: "gb300-fp8/8k1k/stp/max.yaml" prefill: num-worker: 6 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb300-fp8/8k1k/stp/max.yaml - - "CONFIG_FILE=recipes/gb300-fp8/8k1k/stp/max.yaml" decode: num-worker: 1 tp: 24 @@ -5066,13 +4586,12 @@ dsr1-fp4-gb200-dynamo-sglang: # Low latency (1 prefill node, 2 decode nodes) - spec-decoding: "none" conc-list: [ 4, 8, 32 ] + recipe: "gb200-fp4/1k1k/low-latency.yaml" prefill: num-worker: 1 tp: 4 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/gb200-fp4/1k1k/low-latency.yaml" decode: num-worker: 2 tp: 4 @@ -5082,13 +4601,12 @@ dsr1-fp4-gb200-dynamo-sglang: # Mid curve (4 prefill nodes, 8 decode nodes) - spec-decoding: "none" conc-list: [ 512, 2048, 4096, 8192 ] + recipe: "gb200-fp4/1k1k/mid-curve.yaml" prefill: num-worker: 4 tp: 4 ep: 4 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/gb200-fp4/1k1k/mid-curve.yaml" decode: num-worker: 1 tp: 32 @@ -5098,13 +4616,12 @@ dsr1-fp4-gb200-dynamo-sglang: # Max throughput (4 prefill nodes, 12 decode nodes) - spec-decoding: "none" conc-list: [ 2048, 4096 ] + recipe: "gb200-fp4/1k1k/max-tpt.yaml" prefill: num-worker: 4 tp: 4 ep: 4 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/gb200-fp4/1k1k/max-tpt.yaml" decode: num-worker: 1 tp: 48 @@ -5118,13 +4635,12 @@ dsr1-fp4-gb200-dynamo-sglang: # Low latency (1 prefill node, 4 decode nodes) - spec-decoding: "none" conc-list: [ 4, 8 ] + recipe: 
"gb200-fp4/8k1k/low-latency.yaml" prefill: num-worker: 1 tp: 4 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/gb200-fp4/8k1k/low-latency.yaml" decode: num-worker: 4 tp: 4 @@ -5134,13 +4650,12 @@ dsr1-fp4-gb200-dynamo-sglang: # Mid curve (6 prefill nodes, 12 decode nodes) - spec-decoding: "none" conc-list: [ 512, 2048, 4096 ] + recipe: "gb200-fp4/8k1k/mid-curve.yaml" prefill: num-worker: 6 tp: 4 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/gb200-fp4/8k1k/mid-curve.yaml" decode: num-worker: 1 tp: 48 @@ -5150,13 +4665,12 @@ dsr1-fp4-gb200-dynamo-sglang: # Max throughput (10 prefill nodes, 8 decode nodes) - spec-decoding: "none" conc-list: [ 2048 ] + recipe: "gb200-fp4/8k1k/max-tpt.yaml" prefill: num-worker: 10 tp: 4 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/gb200-fp4/8k1k/max-tpt.yaml" decode: num-worker: 1 tp: 32 @@ -5179,14 +4693,12 @@ dsr1-fp4-gb300-dynamo-trt: # MTP configurations - spec-decoding: "mtp" conc-list: [3226] + recipe: "trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml" prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml" decode: num-worker: 1 tp: 4 @@ -5194,14 +4706,12 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [333] + recipe: "trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml" prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml" decode: num-worker: 1 tp: 32 @@ -5209,14 +4719,12 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [5] + recipe: "trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" decode: num-worker: 4 tp: 8 @@ -5224,14 +4732,12 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [8, 12, 24, 48] + recipe: "trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" decode: num-worker: 4 tp: 8 @@ -5239,14 +4745,12 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [2253] + recipe: "trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml" prefill: num-worker: 3 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml" decode: num-worker: 1 tp: 16 @@ -5254,14 +4758,12 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - 
spec-decoding: "mtp" conc-list: [1229] + recipe: "trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml" prefill: num-worker: 3 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml" decode: num-worker: 1 tp: 32 @@ -5269,84 +4771,72 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true # Non-MTP configurations (default spec_decoding="none") - conc-list: [5] + recipe: "trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" decode: num-worker: 4 tp: 8 ep: 8 dp-attn: false - conc-list: [12, 48, 96, 192] + recipe: "trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml" decode: num-worker: 4 tp: 8 ep: 8 dp-attn: false - conc-list: [8192] + recipe: "trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 8 ep: 8 dp-attn: true - conc-list: [1229] + recipe: "trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true - conc-list: [4301] + recipe: "trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml" prefill: num-worker: 3 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true - conc-list: [2253] + recipe: "trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml" prefill: num-worker: 3 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 32 @@ -5358,14 +4848,12 @@ dsr1-fp4-gb300-dynamo-trt: # MTP configurations (spec_decoding="mtp") - spec-decoding: "mtp" conc-list: [33] + recipe: "trtllm/gb300-fp4/8k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true - 
additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml" decode: num-worker: 3 tp: 8 @@ -5373,14 +4861,12 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [5] + recipe: "trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" decode: num-worker: 4 tp: 8 @@ -5388,14 +4874,12 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [12, 24] + recipe: "trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml" decode: num-worker: 4 tp: 8 @@ -5403,14 +4887,12 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [180] + recipe: "trtllm/gb300-fp4/8k1k/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml" prefill: num-worker: 4 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml" decode: num-worker: 1 tp: 32 @@ -5418,14 +4900,12 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [308] + recipe: "trtllm/gb300-fp4/8k1k/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml" prefill: num-worker: 8 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml" decode: num-worker: 1 tp: 32 @@ -5433,14 +4913,12 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [2253] + recipe: "trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml" prefill: num-worker: 10 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml" decode: num-worker: 1 tp: 8 @@ -5448,14 +4926,12 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [666] + recipe: "trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml" prefill: num-worker: 10 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml" decode: num-worker: 1 tp: 16 @@ -5463,14 +4939,12 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1127] + recipe: 
"trtllm/gb300-fp4/8k1k/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml" prefill: num-worker: 13 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml" decode: num-worker: 1 tp: 16 @@ -5478,112 +4952,96 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true # Non-MTP configurations (default spec_decoding="none") - conc-list: [72] + recipe: "trtllm/gb300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml" decode: num-worker: 3 tp: 8 ep: 8 dp-attn: false - conc-list: [5] + recipe: "trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" decode: num-worker: 4 tp: 8 ep: 8 dp-attn: false - conc-list: [12] + recipe: "trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml" decode: num-worker: 4 tp: 8 ep: 8 dp-attn: false - conc-list: [5, 15, 30] + recipe: "trtllm/gb300-fp4/8k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml" decode: num-worker: 5 tp: 4 ep: 4 dp-attn: false - conc-list: [666] + recipe: "trtllm/gb300-fp4/8k1k/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml" prefill: num-worker: 7 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true - conc-list: [1229] + recipe: "trtllm/gb300-fp4/8k1k/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml" prefill: num-worker: 9 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true - conc-list: [3228] + recipe: "trtllm/gb300-fp4/8k1k/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml" prefill: num-worker: 11 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml - - 
"CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml" decode: num-worker: 3 tp: 4 ep: 4 dp-attn: true - conc-list: [2253] + recipe: "trtllm/gb300-fp4/8k1k/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml" prefill: num-worker: 14 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 16 @@ -5607,13 +5065,12 @@ dsr1-fp4-gb300-dynamo-sglang: # Low latency (1 prefill node, 2 decode nodes) - spec-decoding: "none" conc-list: [ 4, 8, 32 ] + recipe: "gb300-fp4/1k1k/low_latency.yaml" prefill: num-worker: 1 tp: 4 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/gb300-fp4/1k1k/low_latency.yaml" decode: num-worker: 2 tp: 4 @@ -5623,13 +5080,12 @@ dsr1-fp4-gb300-dynamo-sglang: # Mid curve (4 prefill nodes, 8 decode nodes) - spec-decoding: "none" conc-list: [ 512, 2048, 4096, 8192 ] + recipe: "gb300-fp4/1k1k/mid_curve.yaml" prefill: num-worker: 4 tp: 4 ep: 4 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/gb300-fp4/1k1k/mid_curve.yaml" decode: num-worker: 1 tp: 32 @@ -5639,13 +5095,12 @@ dsr1-fp4-gb300-dynamo-sglang: # Max throughput (4 prefill nodes, 12 decode nodes) - spec-decoding: "none" conc-list: [ 512, 2048, 4096, 8192 ] + recipe: "gb300-fp4/1k1k/max_tpt.yaml" prefill: num-worker: 4 tp: 4 ep: 4 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/gb300-fp4/1k1k/max_tpt.yaml" decode: num-worker: 1 tp: 48 @@ -5659,13 +5114,12 @@ dsr1-fp4-gb300-dynamo-sglang: # Low latency (1 prefill node, 4 decode nodes) - spec-decoding: "none" conc-list: [ 4, 8, 32, 64 ] + recipe: "gb300-fp4/8k1k/low_latency.yaml" prefill: num-worker: 1 tp: 4 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/gb300-fp4/8k1k/low_latency.yaml" decode: num-worker: 4 tp: 4 @@ -5675,13 +5129,12 @@ dsr1-fp4-gb300-dynamo-sglang: # Mid curve (6 prefill nodes, 12 decode nodes) - spec-decoding: "none" conc-list: [ 512, 2048, 4096 ] + recipe: "gb300-fp4/8k1k/mid_curve.yaml" prefill: num-worker: 6 tp: 4 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/gb300-fp4/8k1k/mid_curve.yaml" decode: num-worker: 1 tp: 48 @@ -5691,13 +5144,12 @@ dsr1-fp4-gb300-dynamo-sglang: # Max throughput (10 prefill nodes, 8 decode nodes) - spec-decoding: "none" conc-list: [ 2048 ] + recipe: "gb300-fp4/8k1k/max_tpt.yaml" prefill: num-worker: 10 tp: 4 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/gb300-fp4/8k1k/max_tpt.yaml" decode: num-worker: 1 tp: 32 @@ -5720,14 +5172,12 @@ dsr1-fp8-gb300-dynamo-trt: # MTP configurations (spec_decoding="mtp") - spec-decoding: "mtp" conc-list: [8] + recipe: "trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml" decode: num-worker: 4 tp: 8 @@ -5735,14 +5185,12 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [24] + recipe: "trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml" decode: num-worker: 4 tp: 8 @@ -5750,14 +5198,12 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [180] + recipe: "trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml" decode: num-worker: 1 tp: 32 @@ -5765,14 +5211,12 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [564] + recipe: "trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml" prefill: num-worker: 2 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml" decode: num-worker: 1 tp: 32 @@ -5780,14 +5224,12 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [666] + recipe: "trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml" decode: num-worker: 1 tp: 16 @@ -5795,14 +5237,12 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [2253] + recipe: "trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml" prefill: num-worker: 2 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml" decode: num-worker: 1 tp: 16 @@ -5810,14 +5250,12 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [8192] + recipe: "trtllm/gb300-fp8/1k1k/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml" prefill: num-worker: 3 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml" decode: num-worker: 2 tp: 8 @@ -5825,98 +5263,84 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: true # STP configurations (no spec_decoding) - conc-list: [4] + recipe: "trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml" decode: num-worker: 4 tp: 8 ep: 8 dp-attn: false - conc-list: [24] + recipe: 
"trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml" decode: num-worker: 4 tp: 8 ep: 8 dp-attn: false - conc-list: [84] + recipe: "trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml" decode: num-worker: 4 tp: 8 ep: 8 dp-attn: false - conc-list: [1229] + recipe: "trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml" prefill: num-worker: 2 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true - conc-list: [2253] + recipe: "trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml" prefill: num-worker: 2 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true - conc-list: [8602] + recipe: "trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml" prefill: num-worker: 3 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml" decode: num-worker: 2 tp: 8 ep: 8 dp-attn: true - conc-list: [12288] + recipe: "trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml" prefill: num-worker: 3 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml" decode: num-worker: 2 tp: 8 @@ -5928,14 +5352,12 @@ dsr1-fp8-gb300-dynamo-trt: # MTP configurations (spec_decoding="mtp") - spec-decoding: "mtp" conc-list: [8] + recipe: "trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml" decode: num-worker: 4 tp: 8 @@ -5943,14 +5365,12 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [24] + recipe: "trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml" decode: num-worker: 4 tp: 8 @@ -5958,14 +5378,12 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [333] + recipe: "trtllm/gb300-fp8/8k1k/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml" prefill: num-worker: 6 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml" decode: num-worker: 1 tp: 32 @@ -5973,14 +5391,12 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [666] + recipe: "trtllm/gb300-fp8/8k1k/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml" prefill: num-worker: 8 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml" decode: num-worker: 1 tp: 16 @@ -5988,14 +5404,12 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1229] + recipe: "trtllm/gb300-fp8/8k1k/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml" prefill: num-worker: 10 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml" decode: num-worker: 1 tp: 16 @@ -6003,14 +5417,12 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1229] + recipe: "trtllm/gb300-fp8/8k1k/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml" prefill: num-worker: 7 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml" decode: num-worker: 1 tp: 8 @@ -6018,98 +5430,84 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: true # STP configurations (no spec_decoding) - conc-list: [4] + recipe: "trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml" decode: num-worker: 4 tp: 8 ep: 8 dp-attn: false - conc-list: [24] + recipe: "trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml" decode: num-worker: 4 tp: 8 ep: 8 dp-attn: false - conc-list: [36] + recipe: "trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - 
additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml" decode: num-worker: 4 tp: 8 ep: 8 dp-attn: false - conc-list: [512] + recipe: "trtllm/gb300-fp8/8k1k/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml" prefill: num-worker: 6 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true - conc-list: [666] + recipe: "trtllm/gb300-fp8/8k1k/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml" prefill: num-worker: 4 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true - conc-list: [1229] + recipe: "trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml" prefill: num-worker: 7 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true - conc-list: [2151] + recipe: "trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml" prefill: num-worker: 7 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml" decode: num-worker: 1 tp: 8 @@ -6402,13 +5800,12 @@ dsr1-fp8-h200-dynamo-sglang: # STP: Low latency (1 prefill, 9 decode, TEP) - spec-decoding: "none" conc-list: [1, 4, 8, 16, 32, 64, 128, 256] + recipe: "h200/1k1k/low-latency-1p9d.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/1k1k/low-latency-1p9d.yaml" decode: num-worker: 9 tp: 8 @@ -6417,13 +5814,12 @@ dsr1-fp8-h200-dynamo-sglang: # STP: High throughput TEP (1 prefill, 6 decode) - spec-decoding: "none" conc-list: [512, 1024, 2048] + recipe: "h200/1k1k/bs256-1p6d-tp.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/1k1k/bs256-1p6d-tp.yaml" decode: num-worker: 6 tp: 8 @@ -6432,13 +5828,12 @@ dsr1-fp8-h200-dynamo-sglang: # STP: High throughput DEP (1 prefill, 6 decode, dp-attention) - spec-decoding: "none" conc-list: [128, 256, 512, 1024, 2048] + recipe: "h200/1k1k/bs256-1p6d-dep.yaml" prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/h200/1k1k/bs256-1p6d-dep.yaml" decode: num-worker: 6 tp: 8 @@ -6447,13 +5842,12 @@ dsr1-fp8-h200-dynamo-sglang: # MTP: Low latency (1 prefill, 9 decode, TEP) - spec-decoding: "mtp" conc-list: [1, 4, 8, 16, 32, 64, 128, 256] + recipe: "h200/1k1k/low-latency-1p9d-mtp.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: false - additional-settings: - - 
"CONFIG_FILE=recipes/h200/1k1k/low-latency-1p9d-mtp.yaml" decode: num-worker: 9 tp: 8 @@ -6462,13 +5856,12 @@ dsr1-fp8-h200-dynamo-sglang: # MTP: High throughput TEP (1 prefill, 6 decode) - spec-decoding: "mtp" conc-list: [512, 1024, 2048] + recipe: "h200/1k1k/bs256-1p6d-tp-mtp.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/1k1k/bs256-1p6d-tp-mtp.yaml" decode: num-worker: 6 tp: 8 @@ -6477,13 +5870,12 @@ dsr1-fp8-h200-dynamo-sglang: # MTP: High throughput DEP (1 prefill, 6 decode, dp-attention) - spec-decoding: "mtp" conc-list: [128, 256, 512, 1024, 2048] + recipe: "h200/1k1k/bs256-1p6d-dep-mtp.yaml" prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/h200/1k1k/bs256-1p6d-dep-mtp.yaml" decode: num-worker: 6 tp: 8 @@ -6495,13 +5887,12 @@ dsr1-fp8-h200-dynamo-sglang: # STP: Low latency TEP (1 prefill, 7 decode) - spec-decoding: "none" conc-list: [1, 4, 8] + recipe: "h200/8k1k/bs4-1p7d.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/8k1k/bs4-1p7d.yaml" decode: num-worker: 7 tp: 8 @@ -6510,13 +5901,12 @@ dsr1-fp8-h200-dynamo-sglang: # STP: TEP (1 prefill, 6 decode) - spec-decoding: "none" conc-list: [4, 8, 16] + recipe: "h200/8k1k/bs8-1p6d.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/8k1k/bs8-1p6d.yaml" decode: num-worker: 6 tp: 8 @@ -6525,13 +5915,12 @@ dsr1-fp8-h200-dynamo-sglang: # STP: TEP (1 prefill, 3 decode) - spec-decoding: "none" conc-list: [8, 16, 32] + recipe: "h200/8k1k/bs16-1p3d.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/8k1k/bs16-1p3d.yaml" decode: num-worker: 3 tp: 8 @@ -6540,13 +5929,12 @@ dsr1-fp8-h200-dynamo-sglang: # STP: TEP (2 prefill, 3 decode) - spec-decoding: "none" conc-list: [32, 64, 128] + recipe: "h200/8k1k/bs64-2p3d.yaml" prefill: num-worker: 2 tp: 8 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/8k1k/bs64-2p3d.yaml" decode: num-worker: 3 tp: 8 @@ -6555,13 +5943,12 @@ dsr1-fp8-h200-dynamo-sglang: # STP: High throughput DEP (1 prefill, 1 decode, dp-attention) - spec-decoding: "none" conc-list: [64, 128, 256] + recipe: "h200/8k1k/bs128-1p1d-dep.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/8k1k/bs128-1p1d-dep.yaml" decode: num-worker: 1 tp: 8 @@ -6570,13 +5957,12 @@ dsr1-fp8-h200-dynamo-sglang: # MTP: Low latency TEP (1 prefill, 7 decode) - spec-decoding: "mtp" conc-list: [1, 4, 8] + recipe: "h200/8k1k/bs4-1p7d-mtp.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/8k1k/bs4-1p7d-mtp.yaml" decode: num-worker: 7 tp: 8 @@ -6585,13 +5971,12 @@ dsr1-fp8-h200-dynamo-sglang: # MTP: TEP (1 prefill, 6 decode) - spec-decoding: "mtp" conc-list: [2, 4, 8, 16, 32] + recipe: "h200/8k1k/bs8-1p6d-mtp.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/8k1k/bs8-1p6d-mtp.yaml" decode: num-worker: 6 tp: 8 @@ -6600,13 +5985,12 @@ dsr1-fp8-h200-dynamo-sglang: # MTP: TEP (1 prefill, 3 decode) - spec-decoding: "mtp" conc-list: [4, 8, 16, 32, 64] + recipe: "h200/8k1k/bs16-1p3d-mtp.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/8k1k/bs16-1p3d-mtp.yaml" decode: num-worker: 3 tp: 8 @@ -6615,13 +5999,12 @@ dsr1-fp8-h200-dynamo-sglang: # MTP: 
TEP (2 prefill, 3 decode) - spec-decoding: "mtp" conc-list: [32, 64, 128] + recipe: "h200/8k1k/bs64-2p3d-mtp.yaml" prefill: num-worker: 2 tp: 8 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/8k1k/bs64-2p3d-mtp.yaml" decode: num-worker: 3 tp: 8 @@ -6630,13 +6013,12 @@ dsr1-fp8-h200-dynamo-sglang: # MTP: High throughput DEP (1 prefill, 1 decode, dp-attention) - spec-decoding: "mtp" conc-list: [32, 64, 128, 256, 512] + recipe: "h200/8k1k/bs128-1p1d-dep-mtp.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/8k1k/bs128-1p1d-dep-mtp.yaml" decode: num-worker: 1 tp: 8 @@ -6658,52 +6040,48 @@ dsr1-fp4-b200-dynamo-sglang: search-space: # Non-MTP configurations - conc-list: [16, 128] + recipe: "b200-fp4/1k1k.yaml:zip_override_stp_lowlat[0]" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_stp_lowlat[0]" decode: num-worker: 5 tp: 8 ep: 8 dp-attn: false - conc-list: [32, 64, 256] + recipe: "b200-fp4/1k1k.yaml:zip_override_stp_lowlat[1]" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_stp_lowlat[1]" decode: num-worker: 6 tp: 8 ep: 8 dp-attn: false - conc-list: [512] + recipe: "b200-fp4/1k1k.yaml:zip_override_stp_maxtpt[0]" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_stp_maxtpt[0]" decode: num-worker: 1 tp: 8 ep: 8 dp-attn: true - conc-list: [512] + recipe: "b200-fp4/1k1k.yaml:zip_override_stp_maxtpt[1]" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_stp_maxtpt[1]" decode: num-worker: 2 tp: 8 @@ -6714,65 +6092,60 @@ dsr1-fp4-b200-dynamo-sglang: search-space: # Non-MTP configurations - conc-list: [64, 128] + recipe: "b200-fp4/8k1k.yaml:zip_override_stp_lowlat[0]" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_stp_lowlat[0]" decode: num-worker: 1 tp: 8 ep: 8 dp-attn: false - conc-list: [8] + recipe: "b200-fp4/8k1k.yaml:zip_override_stp_lowlat[1]" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_stp_lowlat[1]" decode: num-worker: 5 tp: 8 ep: 8 dp-attn: false - conc-list: [4, 128] + recipe: "b200-fp4/8k1k.yaml:zip_override_stp_lowlat[2]" prefill: num-worker: 2 tp: 4 ep: 4 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_stp_lowlat[2]" decode: num-worker: 5 tp: 8 ep: 8 dp-attn: false - conc-list: [4, 8, 16, 64] + recipe: "b200-fp4/8k1k.yaml:override_stp_tp4" prefill: num-worker: 1 tp: 4 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:override_stp_tp4" decode: num-worker: 1 tp: 8 ep: 1 dp-attn: false - conc-list: [1024, 2048] + recipe: "b200-fp4/8k1k.yaml:override_stp_maxtpt_7p2d" prefill: num-worker: 7 tp: 4 ep: 4 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:override_stp_maxtpt_7p2d" decode: num-worker: 2 tp: 8 @@ -6794,52 +6167,48 @@ dsr1-fp8-b200-dynamo-sglang: search-space: # Non-MTP configurations - conc-list: [4] + recipe: "b200-fp8/1k1k.yaml:zip_override_stp_lowlat[0]" prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_stp_lowlat[0]" decode: 
num-worker: 1 tp: 8 ep: 8 dp-attn: false - conc-list: [16, 32, 64, 128, 256] + recipe: "b200-fp8/1k1k.yaml:zip_override_stp_lowlat[1]" prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_stp_lowlat[1]" decode: num-worker: 3 tp: 8 ep: 8 dp-attn: false - conc-list: [1024, 2048, 4096] + recipe: "b200-fp8/1k1k.yaml:zip_override_stp_maxtpt[0]" prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_stp_maxtpt[0]" decode: num-worker: 5 tp: 8 ep: 8 dp-attn: true - conc-list: [2048, 4096] + recipe: "b200-fp8/1k1k.yaml:zip_override_stp_maxtpt[1]" prefill: num-worker: 2 tp: 8 ep: 8 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_stp_maxtpt[1]" decode: num-worker: 5 tp: 8 @@ -6850,42 +6219,36 @@ dsr1-fp8-b200-dynamo-sglang: search-space: # STP low-latency: resolved from 8k1k.yaml zip_override_stp_lowlat - conc-list: [128] + recipe: "b200-fp8/8k1k_stp_lowlat_0.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_lowlat_0.yaml - - "CONFIG_FILE=recipes/b200-fp8/8k1k_stp_lowlat_0.yaml" decode: num-worker: 3 tp: 8 ep: 1 dp-attn: false - conc-list: [128] + recipe: "b200-fp8/8k1k_stp_lowlat_1.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_lowlat_1.yaml - - "CONFIG_FILE=recipes/b200-fp8/8k1k_stp_lowlat_1.yaml" decode: num-worker: 4 tp: 8 ep: 1 dp-attn: false - conc-list: [8, 16, 32, 64, 128] + recipe: "b200-fp8/8k1k_stp_lowlat_2.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_lowlat_2.yaml - - "CONFIG_FILE=recipes/b200-fp8/8k1k_stp_lowlat_2.yaml" decode: num-worker: 6 tp: 8 @@ -6893,56 +6256,48 @@ dsr1-fp8-b200-dynamo-sglang: dp-attn: false # STP max-throughput: resolved from 8k1k.yaml zip_override_stp_maxtpt - conc-list: [288] + recipe: "b200-fp8/8k1k_stp_maxtpt_0.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_maxtpt_0.yaml - - "CONFIG_FILE=recipes/b200-fp8/8k1k_stp_maxtpt_0.yaml" decode: num-worker: 2 tp: 8 ep: 8 dp-attn: true - conc-list: [160, 288] + recipe: "b200-fp8/8k1k_stp_maxtpt_1.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_maxtpt_1.yaml - - "CONFIG_FILE=recipes/b200-fp8/8k1k_stp_maxtpt_1.yaml" decode: num-worker: 1 tp: 8 ep: 8 dp-attn: true - conc-list: [512] + recipe: "b200-fp8/8k1k_stp_maxtpt_2.yaml" prefill: num-worker: 2 tp: 8 ep: 1 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_maxtpt_2.yaml - - "CONFIG_FILE=recipes/b200-fp8/8k1k_stp_maxtpt_2.yaml" decode: num-worker: 1 tp: 8 ep: 8 dp-attn: true - conc-list: [1024] + recipe: "b200-fp8/8k1k_stp_maxtpt_3.yaml" prefill: num-worker: 3 tp: 8 ep: 1 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_maxtpt_3.yaml - - "CONFIG_FILE=recipes/b200-fp8/8k1k_stp_maxtpt_3.yaml" decode: num-worker: 1 
tp: 8 @@ -6965,13 +6320,12 @@ dsr1-fp8-b200-dynamo-sglang-mtp: # MTP low-latency: 1P1D - spec-decoding: "mtp" conc-list: [4, 64] + recipe: "b200-fp8/1k1k.yaml:zip_override_mtp_lowlat[0]" prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_mtp_lowlat[0]" decode: num-worker: 1 tp: 8 @@ -6980,13 +6334,12 @@ dsr1-fp8-b200-dynamo-sglang-mtp: # MTP low-latency: 1P3D - spec-decoding: "mtp" conc-list: [4, 8, 16, 32, 128] + recipe: "b200-fp8/1k1k.yaml:zip_override_mtp_lowlat[1]" prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_mtp_lowlat[1]" decode: num-worker: 3 tp: 8 @@ -6995,13 +6348,12 @@ dsr1-fp8-b200-dynamo-sglang-mtp: # MTP max-tpt: 1P5D - spec-decoding: "mtp" conc-list: [512, 4096] + recipe: "b200-fp8/1k1k.yaml:zip_override_mtp_maxtpt[1]" prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_mtp_maxtpt[1]" decode: num-worker: 5 tp: 8 @@ -7010,13 +6362,12 @@ dsr1-fp8-b200-dynamo-sglang-mtp: # MTP max-tpt: 2P5D - spec-decoding: "mtp" conc-list: [1024, 2048, 4096] + recipe: "b200-fp8/1k1k.yaml:zip_override_mtp_maxtpt[2]" prefill: num-worker: 2 tp: 8 ep: 8 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_mtp_maxtpt[2]" decode: num-worker: 5 tp: 8 @@ -7025,13 +6376,12 @@ dsr1-fp8-b200-dynamo-sglang-mtp: # MTP max-tpt: 1P2D - spec-decoding: "mtp" conc-list: [512, 1024, 2048] + recipe: "b200-fp8/1k1k.yaml:override_mtp_maxtpt_1p2d" prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:override_mtp_maxtpt_1p2d" decode: num-worker: 2 tp: 8 @@ -7043,14 +6393,12 @@ dsr1-fp8-b200-dynamo-sglang-mtp: # MTP low-latency: resolved from 8k1k.yaml zip_override_mtp_lowlat - spec-decoding: "mtp" conc-list: [128] + recipe: "b200-fp8/8k1k_mtp_lowlat_0.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_lowlat_0.yaml - - "CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_lowlat_0.yaml" decode: num-worker: 3 tp: 8 @@ -7058,14 +6406,12 @@ dsr1-fp8-b200-dynamo-sglang-mtp: dp-attn: false - spec-decoding: "mtp" conc-list: [128] + recipe: "b200-fp8/8k1k_mtp_lowlat_1.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_lowlat_1.yaml - - "CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_lowlat_1.yaml" decode: num-worker: 4 tp: 8 @@ -7073,14 +6419,12 @@ dsr1-fp8-b200-dynamo-sglang-mtp: dp-attn: false - spec-decoding: "mtp" conc-list: [8, 16, 32, 64, 128] + recipe: "b200-fp8/8k1k_mtp_lowlat_2.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_lowlat_2.yaml - - "CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_lowlat_2.yaml" decode: num-worker: 6 tp: 8 @@ -7089,14 +6433,12 @@ dsr1-fp8-b200-dynamo-sglang-mtp: # MTP max-throughput: resolved from 8k1k.yaml zip_override_mtp_maxtpt - spec-decoding: "mtp" conc-list: [288] + recipe: "b200-fp8/8k1k_mtp_maxtpt_0.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_maxtpt_0.yaml - - 
"CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_maxtpt_0.yaml" decode: num-worker: 2 tp: 8 @@ -7104,14 +6446,12 @@ dsr1-fp8-b200-dynamo-sglang-mtp: dp-attn: true - spec-decoding: "mtp" conc-list: [160, 288] + recipe: "b200-fp8/8k1k_mtp_maxtpt_1.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_maxtpt_1.yaml - - "CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_maxtpt_1.yaml" decode: num-worker: 1 tp: 8 @@ -7119,14 +6459,12 @@ dsr1-fp8-b200-dynamo-sglang-mtp: dp-attn: true - spec-decoding: "mtp" conc-list: [512] + recipe: "b200-fp8/8k1k_mtp_maxtpt_2.yaml" prefill: num-worker: 2 tp: 8 ep: 1 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_maxtpt_2.yaml - - "CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_maxtpt_2.yaml" decode: num-worker: 1 tp: 8 @@ -7134,14 +6472,12 @@ dsr1-fp8-b200-dynamo-sglang-mtp: dp-attn: true - spec-decoding: "mtp" conc-list: [1024] + recipe: "b200-fp8/8k1k_mtp_maxtpt_3.yaml" prefill: num-worker: 3 tp: 8 ep: 1 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_maxtpt_3.yaml - - "CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_maxtpt_3.yaml" decode: num-worker: 1 tp: 8 @@ -7163,14 +6499,12 @@ dsr1-fp4-b200-dynamo-sglang-mtp: search-space: - spec-decoding: "mtp" conc-list: [16, 512] + recipe: "b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[0]" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[0]" decode: num-worker: 5 tp: 8 @@ -7178,14 +6512,12 @@ dsr1-fp4-b200-dynamo-sglang-mtp: dp-attn: false - spec-decoding: "mtp" conc-list: [32, 64, 256, 512] + recipe: "b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[1]" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[1]" decode: num-worker: 6 tp: 8 @@ -7193,14 +6525,12 @@ dsr1-fp4-b200-dynamo-sglang-mtp: dp-attn: false - spec-decoding: "mtp" conc-list: [512, 1024] + recipe: "b200-fp4/1k1k.yaml:zip_override_mtp_maxtpt[0]" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_maxtpt[0]" decode: num-worker: 1 tp: 8 @@ -7208,14 +6538,12 @@ dsr1-fp4-b200-dynamo-sglang-mtp: dp-attn: true - spec-decoding: "mtp" conc-list: [512] + recipe: "b200-fp4/1k1k.yaml:zip_override_mtp_maxtpt[1]" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_maxtpt[1]" decode: num-worker: 2 tp: 8 @@ -7229,14 +6557,12 @@ dsr1-fp4-b200-dynamo-sglang-mtp: search-space: - spec-decoding: "mtp" conc-list: [64, 128] + recipe: "b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[0]" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/8k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[0]" decode: num-worker: 1 tp: 8 @@ 
-7244,14 +6570,12 @@ dsr1-fp4-b200-dynamo-sglang-mtp: dp-attn: false - spec-decoding: "mtp" conc-list: [8] + recipe: "b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[1]" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/8k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[1]" decode: num-worker: 5 tp: 8 @@ -7259,14 +6583,12 @@ dsr1-fp4-b200-dynamo-sglang-mtp: dp-attn: false - spec-decoding: "mtp" conc-list: [4, 128] + recipe: "b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[2]" prefill: num-worker: 2 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/8k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[2]" decode: num-worker: 5 tp: 8 @@ -7274,14 +6596,12 @@ dsr1-fp4-b200-dynamo-sglang-mtp: dp-attn: false - spec-decoding: "mtp" conc-list: [4, 8, 16, 64] + recipe: "b200-fp4/8k1k.yaml:override_mtp_tp4" prefill: num-worker: 1 tp: 4 ep: 1 dp-attn: false - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/8k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:override_mtp_tp4" decode: num-worker: 1 tp: 8 @@ -7303,98 +6623,84 @@ kimik2.5-fp4-gb200-dynamo-trt: search-space: # Non-MTP configurations (default spec_decoding="none") - conc-list: [ 4, 192, 360, 668 ] + recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml" decode: num-worker: 4 tp: 8 ep: 8 dp-attn: false - conc-list: [ 5, 15, 30, 55 ] + recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml" decode: num-worker: 5 tp: 4 ep: 4 dp-attn: false - conc-list: [ 666 ] + recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true - conc-list: [ 2253 ] + recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml - - 
"CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true - conc-list: [ 4301, 6452 ] + recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 8 ep: 8 dp-attn: true - conc-list: [ 4301 ] + recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true - conc-list: [ 4301 ] + recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 32 @@ -7406,98 +6712,84 @@ kimik2.5-fp4-gb200-dynamo-trt: search-space: # Non-MTP configurations (default spec_decoding="none") - conc-list: [ 4 ] + recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml" decode: num-worker: 4 tp: 8 ep: 8 dp-attn: false - conc-list: [ 156 ] + recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml" decode: num-worker: 4 tp: 4 ep: 4 dp-attn: false - conc-list: [ 5, 15, 30, 60, 105 ] + recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml - - 
"CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml" decode: num-worker: 5 tp: 4 ep: 4 dp-attn: false - conc-list: [ 333 ] + recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true - conc-list: [ 615 ] + recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml" prefill: num-worker: 3 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true - conc-list: [ 2151 ] + recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml" prefill: num-worker: 5 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 8 ep: 8 dp-attn: true - conc-list: [ 2253 ] + recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml" prefill: num-worker: 7 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 16 @@ -7518,28 +6810,24 @@ kimik2.5-fp4-gb200-dynamo-vllm: osl: 1024 search-space: - conc-list: [256, 512, 1024, 2048, 3072, 4096] + recipe: "vllm/kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml - - "CONFIG_FILE=recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true - conc-list: [4, 8, 16, 32, 64, 128] + recipe: "vllm/kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml - - "CONFIG_FILE=recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml" decode: num-worker: 4 tp: 4 @@ -7549,56 +6837,48 @@ kimik2.5-fp4-gb200-dynamo-vllm: osl: 1024 search-space: - conc-list: [4, 8, 16, 32, 128] + recipe: "vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: 
true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml - - "CONFIG_FILE=recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml" decode: num-worker: 4 tp: 4 ep: 4 dp-attn: false - conc-list: [512, 1024] + recipe: "vllm/kimi-k2.5/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml" prefill: num-worker: 3 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml - - "CONFIG_FILE=recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true - conc-list: [2048] + recipe: "vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml" prefill: num-worker: 5 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml - - "CONFIG_FILE=recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml" decode: num-worker: 1 tp: 8 ep: 8 dp-attn: true - conc-list: [3072, 4096] + recipe: "vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml" prefill: num-worker: 6 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml - - "CONFIG_FILE=recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml" decode: num-worker: 1 tp: 16 @@ -7625,13 +6905,12 @@ dsv4-fp4-gb200-dynamo-vllm: # 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch # 1p1d-dep8-tep8.yaml (offload + numa-bind stripped — see recipe header). - conc-list: [1, 4, 8, 16, 32, 64] + recipe: "vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml" prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml" decode: num-worker: 1 tp: 8 @@ -7640,13 +6919,12 @@ dsv4-fp4-gb200-dynamo-vllm: # Mid throughput: 1 prefill (DP=8) + 1 wide decode (DP=16). # 6 nodes. Single prefill is plenty for 1k prompts up to ~conc 4096. - conc-list: [128, 256, 1024, 2048, 4096] + recipe: "vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml" prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml" decode: num-worker: 1 tp: 16 @@ -7656,13 +6934,12 @@ dsv4-fp4-gb200-dynamo-vllm: # The 4096 overlap with the 1p1d block gives a crossover point. 8192 # would saturate 1p1d's prefill, so this topology takes over there. - conc-list: [4096, 8192] + recipe: "vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml" prefill: num-worker: 3 tp: 8 ep: 8 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml" decode: num-worker: 1 tp: 16 @@ -7675,13 +6952,12 @@ dsv4-fp4-gb200-dynamo-vllm: # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). # 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch. - conc-list: [1, 4, 8, 16, 32, 64] + recipe: "vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml" prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml" decode: num-worker: 1 tp: 8 @@ -7689,13 +6965,12 @@ dsv4-fp4-gb200-dynamo-vllm: dp-attn: false # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total. 
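    # Node math (4 GPUs/node on GB200): 3 prefills x 8 GPUs + 16 decode GPUs = 40 GPUs = 10 nodes.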
- conc-list: [512, 1024] + recipe: "vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" prefill: num-worker: 3 tp: 8 ep: 8 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" decode: num-worker: 1 tp: 16 @@ -7704,13 +6979,12 @@ dsv4-fp4-gb200-dynamo-vllm: # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes # (full cluster). Mirrors NVIDIA/srt-slurm PR #67. - conc-list: [4096, 8192] + recipe: "vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" prefill: num-worker: 7 tp: 8 ep: 8 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" decode: num-worker: 1 tp: 16 diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index 75036a986..b6b6a30f3 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -77,6 +77,11 @@ on: required: false type: string default: "[]" + recipe: + description: "Path under benchmarks/multi_node/srt-slurm-recipes/ identifying the srt-slurm recipe to dispatch. May carry an `:override[N]` suffix. Empty for non-srt-slurm multi-node configs." + required: false + type: string + default: "" run-eval: type: boolean required: false @@ -165,6 +170,7 @@ jobs: env: RUNNER_NAME: ${{ runner.name }} RUNNER_TYPE: ${{ inputs.runner }} + RECIPE: ${{ inputs.recipe }} # Hash uniquely on {EXP_NAME}_{PRECISION}_{FRAMEWORK}_prefill-tp{}-ep{}-dp{}-nw{}_decode-tp{}-ep{}-dp{}-nw{}_disagg-{}_spec-{}_conc{}_{runner} RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_${{ env.FRAMEWORK }}_prefill-tp${{ env.PREFILL_TP }}-ep${{ env.PREFILL_EP }}-dp${{ env.PREFILL_DP_ATTN }}-nw${{ env.PREFILL_NUM_WORKERS }}_decode-tp${{ env.DECODE_TP }}-ep${{ env.DECODE_EP }}-dp${{ env.DECODE_DP_ATTN }}-nw${{ env.DECODE_NUM_WORKERS }}_disagg-${{ env.DISAGG }}_spec-${{ env.SPEC_DECODING }}_conc${{ join(fromJson(inputs.conc-list), 'x') }}_${{ runner.name }} run: | @@ -173,6 +179,23 @@ jobs: echo "RESULT_FILENAME=${RESULT_FILENAME}" >> $GITHUB_ENV export ${{ join(fromJson(inputs.prefill-additional-settings), ' ') }} ${{ join(fromJson(inputs.decode-additional-settings), ' ') }} + # Resolve `recipe` (path relative to benchmarks/multi_node/srt-slurm-recipes/, + # optionally ending in `:override[N]`) into an absolute CONFIG_FILE for the + # launcher. Copy the recipe to a scratch path first so the launcher's + # `sed -i` rewrites (job name, health-check timeout, ...) don't mutate the + # tracked file in-place between runs. 
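+          # For example, recipe "b200-fp8/8k1k_mtp_maxtpt_1.yaml" is copied into a
+          # fresh `mktemp -d` scratch dir and exported as
+          # CONFIG_FILE=<scratch>/8k1k_mtp_maxtpt_1.yaml; a suffixed value such as
+          # "b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[0]" resolves the same way,
+          # with the ":zip_override_mtp_lowlat[0]" suffix re-appended after the copy.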
+ if [[ -n "$RECIPE" ]]; then + recipe_path="${RECIPE%%:*}" + recipe_suffix="" + if [[ "$RECIPE" == *:* ]]; then + recipe_suffix=":${RECIPE#*:}" + fi + src="${GITHUB_WORKSPACE}/benchmarks/multi_node/srt-slurm-recipes/${recipe_path}" + scratch_dir="$(mktemp -d)" + scratch_recipe="${scratch_dir}/$(basename "$recipe_path")" + cp "$src" "$scratch_recipe" + export CONFIG_FILE="${scratch_recipe}${recipe_suffix}" + fi export IS_MULTINODE=true bash ./runners/launch_${RUNNER_NAME%%_*}.sh if [ "${{ inputs.eval-only }}" = "true" ]; then diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 74d4889f3..f8961f7b4 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -102,6 +102,7 @@ jobs: decode-ep: ${{ matrix.config.decode.ep }} decode-dp-attn: ${{ matrix.config.decode.dp-attn }} decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} + recipe: ${{ matrix.config.recipe }} run-eval: false ref: ${{ inputs.ref }} @@ -141,6 +142,7 @@ jobs: decode-ep: ${{ matrix.config.decode.ep }} decode-dp-attn: ${{ matrix.config.decode.dp-attn }} decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} + recipe: ${{ matrix.config.recipe }} run-eval: true eval-only: true eval-conc: ${{ matrix.config.eval-conc }} diff --git a/.github/workflows/run-sweep.yml b/.github/workflows/run-sweep.yml index fd1fa91be..4dea7065a 100644 --- a/.github/workflows/run-sweep.yml +++ b/.github/workflows/run-sweep.yml @@ -138,6 +138,7 @@ jobs: decode-ep: ${{ matrix.config.decode.ep }} decode-dp-attn: ${{ matrix.config.decode.dp-attn }} decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} + recipe: ${{ matrix.config.recipe }} run-eval: false sweep-multi-node-8k1k: @@ -257,6 +258,7 @@ jobs: decode-ep: ${{ matrix.config.decode.ep }} decode-dp-attn: ${{ matrix.config.decode.dp-attn }} decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} + recipe: ${{ matrix.config.recipe }} run-eval: true eval-only: true eval-conc: ${{ matrix.config.eval-conc }} diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp4/1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/b200-fp4/1k1k.yaml new file mode 100644 index 000000000..b08193bcb --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/b200-fp4/1k1k.yaml @@ -0,0 +1,259 @@ +# B200-FP4 1k1k — STP and MTP in one file +# +# Two inference modes distinguished by override key names: +# zip_override_stp_* — standard token prediction (no speculative decoding) +# zip_override_mtp_* — multi-token prediction (EAGLE speculative decoding) +# +# Low-latency variants: tep8 decode (DP=1), dep4 prefill (DP=4 TP=4) +# Max-throughput variants: dep8 decode (DP=8), adds SGLANG_MOE_NVFP4_DISPATCH +# +# Note: max-tpt 1d has max-running-requests=1024; max-tpt 2d keeps 512. +# MTP max-tpt 1d additionally uses mem-fraction=0.75 for decode. 
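+#
+# Indexing note: zip_override blocks pair their list-valued fields elementwise,
+# so index [N] selects the Nth entry of every list (name, decode_nodes,
+# decode_workers, max-running-requests, concurrencies, ...). E.g.
+# zip_override_stp_maxtpt[0] is the 1p1d / 1024-request variant and [1] the
+# 1p2d / 512-request variant.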
+# +# Usage: +# srtctl apply -f recipes/b200-fp4/1k1k.yaml # all 8 variants +# srtctl apply -f recipes/b200-fp4/1k1k.yaml:*stp* # all STP variants +# srtctl apply -f recipes/b200-fp4/1k1k.yaml:*mtp* # all MTP variants +# srtctl apply -f recipes/b200-fp4/1k1k.yaml:zip_override_stp_lowlat[0] # STP 1p5d only +# srtctl dry-run -f recipes/b200-fp4/1k1k.yaml # preview + +base: + name: "b200-fp4-stp-1k1k" + + model: + path: "dsr1" + container: "dynamo-sglang" + precision: "fp4" + + resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + decode_nodes: 5 + decode_workers: 5 + gpus_per_node: 8 + + backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + DYN_REQUEST_PLANE: nats + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + DYN_REQUEST_PLANE: nats + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "modelopt_fp4" + + # Disaggregation mode + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 2200 + max-running-requests: 512 + disable-cuda-graph: true + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "modelopt_fp4" + + # Disaggregation mode + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 2200 + max-running-requests: 512 + cuda-graph-max-bs: 512 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 8 + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + 
disable-radix-cache: true + + health_check: + max_attempts: 360 + interval_seconds: 10 + + benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + req_rate: "inf" + + +# STP low-latency: tep8 decode (DP=1), scale sweep 1p5d and 1p6d +zip_override_stp_lowlat: + name: + - "b200-fp4-stp-low-latency-dep4-1p-tep8-5d" + - "b200-fp4-stp-low-latency-dep4-1p-tep8-6d" + resources: + decode_nodes: [5, 6] + decode_workers: [5, 6] + benchmark: + concurrencies: ["16x128", "32x64x256"] + + +# MTP low-latency: same scales as STP, adds EAGLE speculative decoding + fp4-gemm-backend +zip_override_mtp_lowlat: + name: + - "b200-fp4-mtp-low-latency-dep4-1p-tep8-5d" + - "b200-fp4-mtp-low-latency-dep4-1p-tep8-6d" + resources: + decode_nodes: [5, 6] + decode_workers: [5, 6] + backend: + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + sglang_config: + prefill: + fp4-gemm-backend: "flashinfer_trtllm" + decode: + fp4-gemm-backend: "flashinfer_trtllm" + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + benchmark: + concurrencies: ["16x512", "32x64x256x512"] + + +# STP max-throughput: dep8 decode (DP=8), scale sweep 1p1d and 1p2d +# Adds SGLANG_MOE_NVFP4_DISPATCH + SGLANG_FLASHINFER_FP4_GEMM_BACKEND env vars +# 1d: max-running-requests=1024; 2d: keeps 512 +zip_override_stp_maxtpt: + name: + - "b200-fp4-stp-max-tpt-dep4-1p-dep8-1d" + - "b200-fp4-stp-max-tpt-dep4-1p-dep8-2d" + resources: + decode_nodes: [1, 2] + decode_workers: [1, 2] + backend: + decode_environment: + SGLANG_MOE_NVFP4_DISPATCH: "1" + SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" + sglang_config: + prefill: + max-running-requests: [1024, 512] + decode: + data-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + max-running-requests: [1024, 512] + cuda-graph-max-bs: [1024, 512] + benchmark: + concurrencies: ["512", "512"] + + +# MTP max-throughput: dep8 decode, scale sweep 1p1d and 1p2d, adds EAGLE speculative decoding +# Adds SGLANG_MOE_NVFP4_DISPATCH + SGLANG_FLASHINFER_FP4_GEMM_BACKEND + fp4-gemm-backend +# 1d: max-running-requests=1024, mem-fraction=0.75 for decode; 2d: keeps 512/0.85 +zip_override_mtp_maxtpt: + name: + - "b200-fp4-mtp-max-tpt-dep4-1p-dep8-1d" + - "b200-fp4-mtp-max-tpt-dep4-1p-dep8-2d" + resources: + decode_nodes: [1, 2] + decode_workers: [1, 2] + backend: + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + decode_environment: + SGLANG_MOE_NVFP4_DISPATCH: "1" + SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" + SGLANG_ENABLE_SPEC_V2: "1" + sglang_config: + prefill: + fp4-gemm-backend: "flashinfer_trtllm" + max-running-requests: [1024, 512] + decode: + fp4-gemm-backend: "flashinfer_trtllm" + mem-fraction-static: [0.75, 0.85] + data-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + max-running-requests: [1024, 512] + cuda-graph-max-bs: [1024, 512] + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + benchmark: + concurrencies: ["512x1024", "512"] diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp4/8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/b200-fp4/8k1k.yaml new file mode 100644 index 000000000..f5bfc9641 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/b200-fp4/8k1k.yaml @@ -0,0 +1,351 @@ +# B200-FP4 8k1k — STP and MTP in one file +# +# Three modes distinguished by override key names: +# override_stp_tp4 / override_mtp_tp4: 
TP4 prefill (DP=1, EP=1) — low-latency single-node +# zip_override_stp_lowlat / zip_override_mtp_lowlat: dep4 prefill + tep8 decode (DP=1) +# override_stp_maxtpt_7p2d / override_mtp_maxtpt_7p2d: dep4 prefill + dep8 decode, 7p2d +# override_mtp_maxtpt_4p1d: MTP-only 4p1d, no frontends, env-var FP4 backend +# +# Usage: +# srtctl apply -f recipes/b200-fp4/8k1k.yaml # all 11 variants +# srtctl apply -f recipes/b200-fp4/8k1k.yaml:*stp* # all STP variants +# srtctl apply -f recipes/b200-fp4/8k1k.yaml:*mtp* # all MTP variants +# srtctl apply -f recipes/b200-fp4/8k1k.yaml:override_stp_tp4 # STP tp4 only +# srtctl apply -f recipes/b200-fp4/8k1k.yaml:zip_override_stp_lowlat[0] # STP 1p1d only +# srtctl dry-run -f recipes/b200-fp4/8k1k.yaml # preview + +base: + name: "b200-fp4-stp-8k1k" + + dynamo: + version: 0.8.1 + + model: + path: "dsr1" + container: "dynamo-sglang" + precision: "fp4" + + frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 4 + + resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + decode_nodes: 1 + decode_workers: 1 + gpus_per_node: 8 + + backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + DYN_REQUEST_PLANE: nats + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + DYN_REQUEST_PLANE: nats + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "modelopt_fp4" + + # Disaggregation mode + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 9600 + max-running-requests: 512 + disable-cuda-graph: true + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + moe-dense-tp-size: 1 + fp4-gemm-backend: "flashinfer_trtllm" + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "modelopt_fp4" + + # Disaggregation mode + disaggregation-mode: "decode" + disaggregation-transfer-backend: 
nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 9600 + max-running-requests: 512 + cuda-graph-max-bs: 512 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 8 + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + fp4-gemm-backend: "flashinfer_trtllm" + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + health_check: + max_attempts: 360 + interval_seconds: 10 + + benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + req_rate: "inf" + + +# STP TP4 prefill mode: TP4 (DP=1, EP=1) instead of dep4 — low-latency single-node +override_stp_tp4: + name: "b200-fp4-stp-low-latency-tp4-1p-tp8-1d" + frontend: + num_additional_frontends: 2 + backend: + sglang_config: + prefill: + data-parallel-size: 1 + expert-parallel-size: 1 + enable-dp-attention: null + enable-dp-lm-head: null + decode: + expert-parallel-size: 1 + benchmark: + concurrencies: "4x8x16x64" + + +# MTP TP4 prefill mode: same as STP tp4 but adds EAGLE speculative decoding +override_mtp_tp4: + name: "b200-fp4-mtp-low-latency-tp4-1p-tp8-1d" + frontend: + num_additional_frontends: 2 + backend: + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + sglang_config: + prefill: + data-parallel-size: 1 + expert-parallel-size: 1 + enable-dp-attention: null + enable-dp-lm-head: null + decode: + expert-parallel-size: 1 + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + benchmark: + concurrencies: "4x8x16x64" + + +# STP low-latency: dep4 prefill + tep8 decode (DP=1), scale sweep 1p1d/1p5d/2p5d +zip_override_stp_lowlat: + name: + - "b200-fp4-stp-low-latency-dep4-1p-tep8-1d" + - "b200-fp4-stp-low-latency-dep4-1p-tep8-5d" + - "b200-fp4-stp-low-latency-dep4-2p-tep8-5d" + resources: + prefill_nodes: [1, 1, 2] + prefill_workers: [1, 1, 2] + decode_nodes: [1, 5, 5] + decode_workers: [1, 5, 5] + benchmark: + concurrencies: ["64x128", "8", "4x128"] + + +# MTP low-latency: same scales as STP, adds EAGLE speculative decoding +zip_override_mtp_lowlat: + name: + - "b200-fp4-mtp-low-latency-dep4-1p-tep8-1d" + - "b200-fp4-mtp-low-latency-dep4-1p-tep8-5d" + - "b200-fp4-mtp-low-latency-dep4-2p-tep8-5d" + resources: + prefill_nodes: [1, 1, 2] + prefill_workers: [1, 1, 2] + decode_nodes: [1, 5, 5] + decode_workers: [1, 5, 5] + backend: + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + sglang_config: + decode: + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + benchmark: + concurrencies: ["64x128", "8", "4x128"] + + +# STP max-throughput 7p2d: dep4 prefill + dep8 decode, flashinfer_cutlass backend +override_stp_maxtpt_7p2d: + name: "b200-fp4-stp-max-tpt-dep4-7p-dep8-2d" + resources: + prefill_nodes: 7 + prefill_workers: 7 + decode_nodes: 2 + decode_workers: 2 + backend: + decode_environment: + SGLANG_MOE_NVFP4_DISPATCH: "1" + sglang_config: + prefill: + max-prefill-tokens: 65536 + chunked-prefill-size: 65536 + max-running-requests: 1024 + fp4-gemm-backend: "flashinfer_cutlass" + decode: + data-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + max-running-requests: 2048 + 
cuda-graph-max-bs: 1024 + fp4-gemm-backend: "flashinfer_cutlass" + benchmark: + concurrencies: "1024x2048" + + +# MTP max-throughput 7p2d: same as STP but adds EAGLE speculative decoding +override_mtp_maxtpt_7p2d: + name: "b200-fp4-mtp-max-tpt-dep4-7p-dep8-2d" + resources: + prefill_nodes: 7 + prefill_workers: 7 + decode_nodes: 2 + decode_workers: 2 + backend: + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + decode_environment: + SGLANG_MOE_NVFP4_DISPATCH: "1" + SGLANG_ENABLE_SPEC_V2: "1" + sglang_config: + prefill: + max-prefill-tokens: 65536 + chunked-prefill-size: 65536 + max-running-requests: 1024 + fp4-gemm-backend: "flashinfer_cutlass" + decode: + data-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + max-running-requests: 2048 + cuda-graph-max-bs: 1024 + fp4-gemm-backend: "flashinfer_cutlass" + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + benchmark: + concurrencies: "1024x2048" + + +# MTP-only: 4p1d, no frontends, SGLANG_FLASHINFER_FP4_GEMM_BACKEND env var (fp4-gemm-backend: null +# removes the sglang_config key), mem-fraction=0.75 for decode +override_mtp_maxtpt_4p1d: + name: "b200-fp4-mtp-max-tpt-dep4-4p-dep8-1d" + dynamo: null + frontend: null + resources: + prefill_nodes: 4 + prefill_workers: 4 + decode_nodes: 1 + decode_workers: 1 + backend: + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + decode_environment: + SGLANG_MOE_NVFP4_DISPATCH: "1" + SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" + SGLANG_ENABLE_SPEC_V2: "1" + sglang_config: + prefill: + max-running-requests: 1024 + fp4-gemm-backend: null + decode: + mem-fraction-static: 0.75 + data-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + fp4-gemm-backend: null + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + benchmark: + concurrencies: "1024" diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/1k1k.yaml new file mode 100644 index 000000000..7489586aa --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/1k1k.yaml @@ -0,0 +1,281 @@ +# B200-FP8 1k1k — STP and MTP in one file +# +# Two inference modes distinguished by override key names: +# zip_override_stp_* — standard token prediction (no speculative decoding) +# zip_override_mtp_* — multi-token prediction (EAGLE speculative decoding) +# +# Low-latency variants: tep8 decode (DP=1) +# Max-throughput variants: dep8 decode (DP=8) +# +# Usage: +# srtctl apply -f recipes/b200-fp8/1k1k.yaml # all 10 variants +# srtctl apply -f recipes/b200-fp8/1k1k.yaml:*stp* # all STP variants +# srtctl apply -f recipes/b200-fp8/1k1k.yaml:*mtp* # all MTP variants +# srtctl apply -f recipes/b200-fp8/1k1k.yaml:zip_override_stp_lowlat[0] # STP 1p1d only +# srtctl dry-run -f recipes/b200-fp8/1k1k.yaml # preview + +base: + name: "b200-fp8-stp-1k1k" + + model: + path: "dsr1-fp8" + container: "dynamo-sglang" + precision: "fp8" + + resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 1 + decode_workers: 1 + gpus_per_node: 8 + + backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + 
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + DYN_REQUEST_PLANE: nats + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + DYN_REQUEST_PLANE: nats + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "fp8" + + # Disaggregation mode + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 2200 + max-running-requests: 512 + disable-cuda-graph: true + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 8 + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + # moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "fp8" + + # Disaggregation mode + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 2200 + max-running-requests: 512 + cuda-graph-max-bs: 512 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 8 + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + # moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + # disable-chunked-prefix-cache: true + + health_check: + max_attempts: 360 + interval_seconds: 10 + + benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + req_rate: "inf" + + +# STP low-latency: tep8 decode (DP=1), scale sweep 1p1d and 1p3d +zip_override_stp_lowlat: + name: + - "b200-fp8-stp-low-latency-tep8-1p-1d" + - "b200-fp8-stp-low-latency-tep8-1p-3d" + resources: + decode_nodes: [1, 3] + decode_workers: [1, 3] + benchmark: + concurrencies: ["4", "16x32x64x128x256"] + + +# MTP low-latency: same scales as STP, adds EAGLE speculative decoding +zip_override_mtp_lowlat: + name: + - "b200-fp8-mtp-low-latency-tep8-1p-1d" + - "b200-fp8-mtp-low-latency-tep8-1p-3d" + resources: + decode_nodes: [1, 3] + decode_workers: [1, 3] + backend: + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + sglang_config: + prefill: + moe-dense-tp-size: 1 + decode: + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + 
speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + benchmark: + concurrencies: ["4x64", "4x8x16x32x128"] + + +# STP max-throughput: dep8 decode (DP=8), scale sweep 1p5d and 2p5d +zip_override_stp_maxtpt: + name: + - "b200-fp8-stp-max-tpt-dep8-1p-5d" + - "b200-fp8-stp-max-tpt-dep8-2p-5d" + resources: + prefill_nodes: [1, 2] + prefill_workers: [1, 2] + decode_nodes: [5, 5] + decode_workers: [5, 5] + backend: + sglang_config: + prefill: + data-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + max-running-requests: 1024 + decode: + data-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + benchmark: + concurrencies: ["1024", "2048"] + + +# MTP max-throughput: dep8 decode, scale sweep 1p1d/1p5d/2p5d, adds EAGLE speculative decoding +# Note: max-running-requests stays at 512 for MTP (unlike STP which raises to 1024) +zip_override_mtp_maxtpt: + name: + - "b200-fp8-mtp-max-tpt-dep8-1p-1d" + - "b200-fp8-mtp-max-tpt-dep8-1p-5d" + - "b200-fp8-mtp-max-tpt-dep8-2p-5d" + resources: + prefill_nodes: [1, 1, 2] + prefill_workers: [1, 1, 2] + decode_nodes: [1, 5, 5] + decode_workers: [1, 5, 5] + backend: + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + sglang_config: + prefill: + data-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + decode: + data-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + benchmark: + concurrencies: ["512x1024x2048x4096", "512x4096", "1024x2048x4096"] + + +# MTP special case: 1p2d uses speculative-num-steps=1 and draft-tokens=2 (vs 2/3 for all others) +override_mtp_maxtpt_1p2d: + name: "b200-fp8-mtp-max-tpt-dep8-1p-2d" + resources: + decode_nodes: 2 + decode_workers: 2 + backend: + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + sglang_config: + prefill: + data-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + decode: + data-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + speculative-algorithm: "EAGLE" + speculative-num-steps: 1 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 2 + benchmark: + concurrencies: "512x1024x2048" diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_lowlat_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_lowlat_0.yaml new file mode 100644 index 000000000..3c1f465fa --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_lowlat_0.yaml @@ -0,0 +1,141 @@ +name: b200-fp8-mtp-low-latency-tep8-1p-1d + +dynamo: + version: 0.9.1 + +model: + path: dsr1-fp8 + container: dynamo-sglang + precision: fp8 + +resources: + gpu_type: b200 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 3 + decode_workers: 3 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + 
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + SGLANG_ENABLE_SPEC_V2: '1' + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + SGLANG_ENABLE_SPEC_V2: '1' + sglang_config: + prefill: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + load-balance-method: round_robin + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 8192 + chunked-prefill-size: 65536 + max-running-requests: 8 + context-length: 9600 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + context-length: 9600 + max-running-requests: 32 + cuda-graph-max-bs: 32 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 1 + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + speculative-algorithm: EAGLE + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 +health_check: + max_attempts: 720 + interval_seconds: 10 + +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + req_rate: inf + concurrencies: '128' diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_lowlat_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_lowlat_1.yaml new file mode 100644 index 000000000..51671712c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_lowlat_1.yaml @@ -0,0 +1,141 @@ +name: b200-fp8-mtp-low-latency-tep8-1p-4d + +dynamo: + version: 0.9.1 + +model: + path: dsr1-fp8 + container: dynamo-sglang + precision: fp8 + +resources: + gpu_type: b200 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 4 + decode_workers: 4 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' 
+ SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + SGLANG_ENABLE_SPEC_V2: '1' + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + SGLANG_ENABLE_SPEC_V2: '1' + sglang_config: + prefill: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + load-balance-method: round_robin + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 8192 + chunked-prefill-size: 65536 + max-running-requests: 8 + context-length: 9600 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + context-length: 9600 + max-running-requests: 32 + cuda-graph-max-bs: 32 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 1 + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + speculative-algorithm: EAGLE + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 +health_check: + max_attempts: 720 + interval_seconds: 10 + +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + req_rate: inf + concurrencies: '128' diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_lowlat_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_lowlat_2.yaml new file mode 100644 index 000000000..27dbbe30d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_lowlat_2.yaml @@ -0,0 +1,141 @@ +name: b200-fp8-mtp-low-latency-tep8-1p-6d + +dynamo: + version: 0.9.1 + +model: + path: dsr1-fp8 + container: dynamo-sglang + precision: fp8 + +resources: + gpu_type: b200 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 6 + decode_workers: 6 + gpus_per_node: 8 + 
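+# 7 nodes total: one prefill node plus six decode nodes, one decode worker per node.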
+backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + SGLANG_ENABLE_SPEC_V2: '1' + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + SGLANG_ENABLE_SPEC_V2: '1' + sglang_config: + prefill: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + load-balance-method: round_robin + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 8192 + chunked-prefill-size: 65536 + max-running-requests: 8 + context-length: 9600 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + context-length: 9600 + max-running-requests: 22 + cuda-graph-max-bs: 22 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 1 + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + speculative-algorithm: EAGLE + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 +health_check: + max_attempts: 720 + interval_seconds: 10 + +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + req_rate: inf + concurrencies: 8x16x32x64x128 diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_0.yaml new file mode 100644 index 000000000..e5eefa2d2 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_0.yaml @@ -0,0 +1,144 @@ +name: b200-fp8-mtp-max-tpt-dep8-1p-1d + +dynamo: + version: 0.9.1 + 
+model: + path: dsr1-fp8 + container: dynamo-sglang + precision: fp8 + +resources: + gpu_type: b200 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 2 + decode_workers: 2 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + SGLANG_ENABLE_SPEC_V2: '1' + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + SGLANG_ENABLE_SPEC_V2: '1' + sglang_config: + prefill: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + load-balance-method: round_robin + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 8192 + chunked-prefill-size: 65536 + max-running-requests: 8 + context-length: 9600 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + context-length: 9600 + max-running-requests: 128 + cuda-graph-max-bs: 16 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + speculative-algorithm: EAGLE + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 +health_check: + max_attempts: 720 + interval_seconds: 10 + +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + req_rate: inf + concurrencies: '288' diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_1.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_1.yaml new file mode 100644 index 000000000..fe0cd9a9f --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_1.yaml @@ -0,0 +1,144 @@ +name: b200-fp8-mtp-max-tpt-dep8-1p-2d + +dynamo: + version: 0.9.1 + +model: + path: dsr1-fp8 + container: dynamo-sglang + precision: fp8 + +resources: + gpu_type: b200 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 1 + decode_workers: 1 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + SGLANG_ENABLE_SPEC_V2: '1' + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + SGLANG_ENABLE_SPEC_V2: '1' + sglang_config: + prefill: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + load-balance-method: round_robin + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 8192 + chunked-prefill-size: 65536 + max-running-requests: 8 + context-length: 9600 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + context-length: 9600 + max-running-requests: 256 + cuda-graph-max-bs: 32 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + speculative-algorithm: EAGLE + speculative-num-steps: 2 + speculative-eagle-topk: 1 + 
speculative-num-draft-tokens: 3 +health_check: + max_attempts: 720 + interval_seconds: 10 + +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + req_rate: inf + concurrencies: 160x288 diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_2.yaml new file mode 100644 index 000000000..7d050ff12 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_2.yaml @@ -0,0 +1,144 @@ +name: b200-fp8-mtp-max-tpt-dep8-2p-1d + +dynamo: + version: 0.9.1 + +model: + path: dsr1-fp8 + container: dynamo-sglang + precision: fp8 + +resources: + gpu_type: b200 + prefill_nodes: 2 + prefill_workers: 2 + decode_nodes: 1 + decode_workers: 1 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + SGLANG_ENABLE_SPEC_V2: '1' + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + SGLANG_ENABLE_SPEC_V2: '1' + sglang_config: + prefill: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + load-balance-method: round_robin + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 8192 + chunked-prefill-size: 65536 + max-running-requests: 8 + context-length: 9600 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + context-length: 9600 + max-running-requests: 512 + cuda-graph-max-bs: 64 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + + # Other flags + stream-interval: 
30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + speculative-algorithm: EAGLE + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 +health_check: + max_attempts: 720 + interval_seconds: 10 + +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + req_rate: inf + concurrencies: '512' diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_3.yaml new file mode 100644 index 000000000..e687ccf84 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_3.yaml @@ -0,0 +1,144 @@ +name: b200-fp8-mtp-max-tpt-dep8-3p-1d + +dynamo: + version: 0.9.1 + +model: + path: dsr1-fp8 + container: dynamo-sglang + precision: fp8 + +resources: + gpu_type: b200 + prefill_nodes: 3 + prefill_workers: 3 + decode_nodes: 1 + decode_workers: 1 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + SGLANG_ENABLE_SPEC_V2: '1' + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + SGLANG_ENABLE_SPEC_V2: '1' + sglang_config: + prefill: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + load-balance-method: round_robin + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 8192 + chunked-prefill-size: 65536 + max-running-requests: 8 + context-length: 9600 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + context-length: 9600 + max-running-requests: 1024 + 
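# 1024 running requests over dp 8 = 128 per rank, so the graph cap below covers the full per-rank decode batch. +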
cuda-graph-max-bs: 128 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + speculative-algorithm: EAGLE + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 +health_check: + max_attempts: 720 + interval_seconds: 10 + +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + req_rate: inf + concurrencies: '1024' diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_lowlat_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_lowlat_0.yaml new file mode 100644 index 000000000..894cef0c7 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_lowlat_0.yaml @@ -0,0 +1,139 @@ +name: b200-fp8-stp-low-latency-tp8-1p-3d + +dynamo: + version: 0.9.1 + +model: + path: dsr1-fp8 + container: dynamo-sglang + precision: fp8 + +resources: + gpu_type: b200 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 3 + decode_workers: 3 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + + sglang_config: + prefill: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + load-balance-method: round_robin + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 8192 + chunked-prefill-size: 65536 + max-running-requests: 8 + context-length: 9600 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + 
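# Decode reuses the prefill model settings; the disaggregation role, memory split, batch limits, and parallelism differ below. +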
quantization: fp8 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + context-length: 9600 + max-running-requests: 32 + cuda-graph-max-bs: 32 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 1 + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + # disable-chunked-prefix-cache: true + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + req_rate: inf + concurrencies: '128' diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_lowlat_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_lowlat_1.yaml new file mode 100644 index 000000000..c05382ef8 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_lowlat_1.yaml @@ -0,0 +1,139 @@ +name: b200-fp8-stp-low-latency-tp8-1p-4d + +dynamo: + version: 0.9.1 + +model: + path: dsr1-fp8 + container: dynamo-sglang + precision: fp8 + +resources: + gpu_type: b200 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 4 + decode_workers: 4 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + + sglang_config: + prefill: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + load-balance-method: round_robin + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 8192 + chunked-prefill-size: 65536 + max-running-requests: 8 + context-length: 9600 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + 
served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + context-length: 9600 + max-running-requests: 32 + cuda-graph-max-bs: 32 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 1 + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + # disable-chunked-prefix-cache: true + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + req_rate: inf + concurrencies: '128' diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_lowlat_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_lowlat_2.yaml new file mode 100644 index 000000000..69e36a289 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_lowlat_2.yaml @@ -0,0 +1,139 @@ +name: b200-fp8-stp-low-latency-tp8-1p-6d + +dynamo: + version: 0.9.1 + +model: + path: dsr1-fp8 + container: dynamo-sglang + precision: fp8 + +resources: + gpu_type: b200 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 6 + decode_workers: 6 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + + sglang_config: + prefill: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + load-balance-method: round_robin + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 8192 + chunked-prefill-size: 65536 + max-running-requests: 8 + context-length: 9600 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + 
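# Prefix (radix) caching stays off so every benchmark request is prefilled from scratch rather than served from cache. +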
disable-radix-cache: true + + decode: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + context-length: 9600 + max-running-requests: 22 + cuda-graph-max-bs: 22 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 1 + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + # disable-chunked-prefix-cache: true + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + req_rate: inf + concurrencies: 8x16x32x64x128 diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_0.yaml new file mode 100644 index 000000000..9846a1f05 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_0.yaml @@ -0,0 +1,140 @@ +name: b200-fp8-stp-max-tpt-dep8-1p-2d + +dynamo: + version: 0.9.1 + +model: + path: dsr1-fp8 + container: dynamo-sglang + precision: fp8 + +resources: + gpu_type: b200 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 2 + decode_workers: 2 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + + sglang_config: + prefill: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + load-balance-method: round_robin + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 8192 + chunked-prefill-size: 65536 + max-running-requests: 8 + context-length: 9600 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + 
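# Streaming output every 30 tokens instead of per token trims scheduler and frontend overhead during sweeps. +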
watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + context-length: 9600 + max-running-requests: 128 + cuda-graph-max-bs: 128 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + req_rate: inf + concurrencies: '288' diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_1.yaml new file mode 100644 index 000000000..e4eccdeab --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_1.yaml @@ -0,0 +1,140 @@ +name: b200-fp8-stp-max-tpt-dep8-1p-1d + +dynamo: + version: 0.9.1 + +model: + path: dsr1-fp8 + container: dynamo-sglang + precision: fp8 + +resources: + gpu_type: b200 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 1 + decode_workers: 1 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + + sglang_config: + prefill: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + load-balance-method: round_robin + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 8192 + chunked-prefill-size: 65536 + max-running-requests: 8 + context-length: 9600 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + 
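# flashinfer_trtllm routes MoE through FlashInfer's TRT-LLM kernel path. +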
moe-runner-backend: flashinfer_trtllm + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + context-length: 9600 + max-running-requests: 256 + cuda-graph-max-bs: 256 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + req_rate: inf + concurrencies: 160x288 diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_2.yaml new file mode 100644 index 000000000..c4cc2dd33 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_2.yaml @@ -0,0 +1,140 @@ +name: b200-fp8-stp-max-tpt-dep8-2p-1d + +dynamo: + version: 0.9.1 + +model: + path: dsr1-fp8 + container: dynamo-sglang + precision: fp8 + +resources: + gpu_type: b200 + prefill_nodes: 2 + prefill_workers: 2 + decode_nodes: 1 + decode_workers: 1 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + + sglang_config: + prefill: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + load-balance-method: round_robin + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 8192 + chunked-prefill-size: 65536 + max-running-requests: 8 + context-length: 9600 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: 
true + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + context-length: 9600 + max-running-requests: 512 + cuda-graph-max-bs: 512 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + req_rate: inf + concurrencies: '512' diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_3.yaml new file mode 100644 index 000000000..59cbb8197 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_3.yaml @@ -0,0 +1,140 @@ +name: b200-fp8-stp-max-tpt-dep8-3p-1d + +dynamo: + version: 0.9.1 + +model: + path: dsr1-fp8 + container: dynamo-sglang + precision: fp8 + +resources: + gpu_type: b200 + prefill_nodes: 3 + prefill_workers: 3 + decode_nodes: 1 + decode_workers: 1 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + + sglang_config: + prefill: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + load-balance-method: round_robin + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 8192 + chunked-prefill-size: 65536 + max-running-requests: 8 + context-length: 9600 + + # Parallelism + tensor-parallel-size: 8 + 
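# dp == tp == 8 with dp-attention: each rank serves its own requests and KV; expert parallelism is off in prefill (ep 1). +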
data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + context-length: 9600 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + req_rate: inf + concurrencies: '1024' diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/1k1k/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/1k1k/low-latency.yaml new file mode 100644 index 000000000..8729aa6fd --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/1k1k/low-latency.yaml @@ -0,0 +1,116 @@ +name: "gb200-fp4-1k1k-low-latency" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 3 + nginx_container: nginx-sqsh + +model: + path: "dsr1-fp4" + container: "dynamo-sglang" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 2 + gpus_per_node: 4 + +backend: + + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + + sglang_config: + prefill: + disaggregation-mode: "prefill" + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + disable-radix-cache: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + stream-interval: 10 + watchdog-timeout: 
1000000 + context-length: 2200 + mem-fraction-static: 0.95 + max-total-tokens: 8192 + chunked-prefill-size: 8192 + cuda-graph-max-bs: 256 + max-running-requests: 512 + scheduler-recv-interval: 10 + enable-symm-mem: true + load-balance-method: "round_robin" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + fp4-gemm-backend: "flashinfer_trtllm" + data-parallel-size: 1 + tensor-parallel-size: 4 + expert-parallel-size: 1 + + decode: + disaggregation-mode: "decode" + served-model-name: "deepseek-ai/DeepSeek-R1" + prefill-round-robin-balance: true + trust-remote-code: true + disable-radix-cache: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + disaggregation-bootstrap-port: 30001 + stream-interval: 10 + watchdog-timeout: 1000000 + context-length: 2200 + mem-fraction-static: 0.95 + chunked-prefill-size: 8192 + cuda-graph-max-bs: 256 + scheduler-recv-interval: 10 + enable-symm-mem: true + disaggregation-transfer-backend: nixl + fp4-gemm-backend: "flashinfer_trtllm" + tensor-parallel-size: 4 + expert-parallel-size: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x8x32" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/1k1k/max-tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/1k1k/max-tpt.yaml new file mode 100644 index 000000000..1075c93eb --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/1k1k/max-tpt.yaml @@ -0,0 +1,183 @@ +name: "gb200-fp4-1k1k-max-tpt" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + nginx_container: nginx-sqsh + +model: + path: "dsr1-fp4" + container: "dynamo-sglang" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 4 + decode_nodes: 12 + prefill_workers: 4 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_MOE_NVFP4_DISPATCH: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: 
"trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_cutlass" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 2176 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.84 + max-total-tokens: 131072 + max-prefill-tokens: 32768 + chunked-prefill-size: 65536 + enable-single-batch-overlap: true + + # Request handling + max-running-requests: 30000 + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" + disaggregation-transfer-backend: nixl + + # Parallelism + tp-size: 4 + dp-size: 4 + ep-size: 4 + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_cutedsl" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 2176 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.83 + max-total-tokens: 3122380 + chunked-prefill-size: 786432 + + # Request handling + max-running-requests: 67584 + enable-single-batch-overlap: true + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + ep-num-redundant-experts: 32 + + # CUDA graphs (extensive batch size list) + cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 1024] + num-reserved-decode-tokens: 112 + + # Additional decode optimizations + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + enable-dp-attention: true + disaggregation-transfer-backend: nixl + fp4-gemm-backend: "flashinfer_cutlass" + + # Parallelism + tp-size: 48 + dp-size: 48 + ep-size: 48 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "2048x4096" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/1k1k/mid-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/1k1k/mid-curve.yaml new file mode 100644 index 000000000..d8c80dea7 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/1k1k/mid-curve.yaml @@ -0,0 +1,182 @@ +name: "gb200-fp4-1k1k-mid-curve" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + nginx_container: nginx-sqsh + +model: + path: "dsr1-fp4" + container: "dynamo-sglang" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 4 + decode_nodes: 8 + prefill_workers: 4 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment 
variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_MOE_NVFP4_DISPATCH: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_cutlass" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 2176 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.84 + max-total-tokens: 131072 + max-prefill-tokens: 32768 + chunked-prefill-size: 65536 + enable-single-batch-overlap: true + + # Request handling + max-running-requests: 30000 + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" + disaggregation-transfer-backend: nixl + + # Parallelism + tp-size: 4 + dp-size: 4 + ep-size: 4 + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_cutedsl" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 2176 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.83 + max-total-tokens: 3122380 + chunked-prefill-size: 786432 + + # Request handling + max-running-requests: 67584 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + ep-num-redundant-experts: 32 + + # CUDA graphs (extensive batch 
size list) + cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 1024] + num-reserved-decode-tokens: 112 + + # Additional decode optimizations + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + enable-dp-attention: true + disaggregation-transfer-backend: nixl + fp4-gemm-backend: "flashinfer_cutlass" + + # Parallelism + tp-size: 32 + dp-size: 32 + ep-size: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "512x2048x4096x8192" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/8k1k/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/8k1k/low-latency.yaml new file mode 100644 index 000000000..14ebda144 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/8k1k/low-latency.yaml @@ -0,0 +1,118 @@ +name: "gb200-fp4-8k1k-low-latency" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 4 + nginx_container: nginx-sqsh + +model: + path: "dsr1-fp4" + container: "dynamo-sglang" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 1 + decode_nodes: 4 + prefill_workers: 1 + decode_workers: 4 + gpus_per_node: 4 + +backend: + + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + + sglang_config: + prefill: + disaggregation-mode: "prefill" + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + disable-radix-cache: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + stream-interval: 50 + watchdog-timeout: 1000000 + context-length: 9600 + mem-fraction-static: 0.95 + max-total-tokens: 32768 + chunked-prefill-size: 24576 + cuda-graph-max-bs: 256 + max-running-requests: 512 + scheduler-recv-interval: 10 + enable-symm-mem: true + load-balance-method: "round_robin" + disaggregation-bootstrap-port: 30001 + data-parallel-size: 1 + disaggregation-transfer-backend: nixl + fp4-gemm-backend: "flashinfer_trtllm" + tensor-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: false + + decode: + disaggregation-mode: "decode" + served-model-name: 
"deepseek-ai/DeepSeek-R1" + prefill-round-robin-balance: true + trust-remote-code: true + disable-radix-cache: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + disaggregation-bootstrap-port: 30001 + stream-interval: 50 + watchdog-timeout: 1000000 + context-length: 9600 + mem-fraction-static: 0.95 + chunked-prefill-size: 8192 + cuda-graph-max-bs: 256 + scheduler-recv-interval: 10 + enable-symm-mem: true + disaggregation-transfer-backend: nixl + fp4-gemm-backend: "flashinfer_trtllm" + tensor-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: false + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x8" + req_rate: 300 diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/8k1k/max-tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/8k1k/max-tpt.yaml new file mode 100644 index 000000000..cf2759871 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/8k1k/max-tpt.yaml @@ -0,0 +1,179 @@ +name: "gb200-fp4-8k1k-max-tpt" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + nginx_container: nginx-sqsh + +model: + path: "dsr1-fp4" + container: "dynamo-sglang" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 10 + decode_nodes: 8 + prefill_workers: 10 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512" + SGLANG_MOE_NVFP4_DISPATCH: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 9600 + disable-shared-experts-fusion: true + disaggregation-bootstrap-port: 30001 + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + 
mem-fraction-static: 0.95 + max-total-tokens: 131072 + max-prefill-tokens: 524288 + chunked-prefill-size: 131072 + + # Request handling + max-running-requests: 30000 + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + enable-dp-attention: false + fp4-gemm-backend: "flashinfer_cutlass" + disaggregation-transfer-backend: nixl + + # Parallelism + tp-size: 4 + dp-size: 1 + ep-size: 1 + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_cutedsl" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 9600 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.83 + max-total-tokens: 524288 + chunked-prefill-size: 24576 + + # Request handling + max-running-requests: 16384 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + ep-num-redundant-experts: 32 + + cuda-graph-max-bs: 512 + num-reserved-decode-tokens: 112 + + # Additional decode optimizations + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" + disaggregation-transfer-backend: nixl + + # Parallelism + tp-size: 32 + dp-size: 32 + ep-size: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2048" + req_rate: 700 diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/8k1k/mid-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/8k1k/mid-curve.yaml new file mode 100644 index 000000000..8380eb5bf --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/8k1k/mid-curve.yaml @@ -0,0 +1,179 @@ +name: "gb200-fp4-8k1k-mid-curve" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + nginx_container: nginx-sqsh + +model: + path: "dsr1-fp4" + container: "dynamo-sglang" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 6 + decode_nodes: 12 + prefill_workers: 6 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + 
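# Near-infinite disaggregation timeouts (editor note) keep long multi-node warmups from tripping KV-transfer failure handling. +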
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512" + SGLANG_MOE_NVFP4_DISPATCH: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 9600 + disable-shared-experts-fusion: true + disaggregation-bootstrap-port: 30001 + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.95 + max-total-tokens: 131072 + max-prefill-tokens: 524288 + chunked-prefill-size: 131072 + + # Request handling + max-running-requests: 30000 + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + enable-dp-attention: false + fp4-gemm-backend: "flashinfer_cutlass" + disaggregation-transfer-backend: nixl + + # Parallelism + tp-size: 4 + dp-size: 1 + ep-size: 1 + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_cutedsl" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 9600 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.83 + max-total-tokens: 524288 + chunked-prefill-size: 24576 + + # Request handling + max-running-requests: 16384 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + ep-num-redundant-experts: 32 + + cuda-graph-max-bs: 512 + num-reserved-decode-tokens: 112 + + # Additional decode optimizations + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" + disaggregation-transfer-backend: nixl + + # Parallelism + tp-size: 48 + dp-size: 48 + ep-size: 48 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "512x2048x4096" + req_rate: 700 diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/low-latency.yaml new file mode 100644 index 000000000..155d1664c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/low-latency.yaml @@ -0,0 +1,121 @@ +name: "gb200-fp8-1k1k-low-latency" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 2 + nginx_container: nginx + +model: + path: "dsr1-fp8" + container: "dynamo-sglang" + precision: 
"fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_node: 4 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_ENABLE_FLASHINFER_GEMM: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "fp8" + moe-runner-backend: "flashinfer_trtllm" + disable-radix-cache: true + stream-interval: 10 + watchdog-timeout: 1000000 + context-length: 2200 + disaggregation-mode: "prefill" + mem-fraction-static: 0.95 + max-total-tokens: 8192 + chunked-prefill-size: 8192 + cuda-graph-max-bs: 128 + max-running-requests: 512 + load-balance-method: "round_robin" + scheduler-recv-interval: 10 + fp8-gemm-backend: "flashinfer_trtllm" + enable-symm-mem: true + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + + decode: + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "fp8" + moe-runner-backend: "flashinfer_trtllm" + disable-radix-cache: true + stream-interval: 10 + watchdog-timeout: 1000000 + context-length: 2200 + disaggregation-mode: "decode" + mem-fraction-static: 0.95 + chunked-prefill-size: 8192 + cuda-graph-max-bs: 128 + max-running-requests: 128 + scheduler-recv-interval: 10 + enable-symm-mem: true + prefill-round-robin-balance: true + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + fp8-gemm-backend: "flashinfer_trtllm" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x8" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/max-tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/max-tpt.yaml new file mode 100644 index 000000000..5d3c91794 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/max-tpt.yaml @@ -0,0 +1,175 @@ +name: "gb200-fp8-1k1k-max-tpt" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + nginx_container: nginx + +model: + path: "dsr1-fp8" + 
container: "dynamo-sglang" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 4 + prefill_workers: 2 + decode_nodes: 8 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + max-running-requests: 30000 + context-length: 2200 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.75 + max-total-tokens: 524288 + chunked-prefill-size: 131072 + + # Request handling + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "normal" + ep-dispatch-algorithm: "dynamic" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + + disaggregation-transfer-backend: nixl + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 32 + dp-size: 32 + ep-size: 32 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + max-running-requests: 45000 + context-length: 2200 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.82 + chunked-prefill-size: 36864 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + 
ep-dispatch-algorithm: "static" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + + # CUDA graphs + cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768] + cuda-graph-max-bs: 768 + + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1024x2048x4096x6144" + req_rate: "inf" + diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/mid-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/mid-curve.yaml new file mode 100644 index 000000000..1f83ed1bd --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/mid-curve.yaml @@ -0,0 +1,174 @@ +name: "gb200-fp8-1k1k-mid-curve" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + nginx_container: nginx + +model: + path: "dsr1-fp8" + container: "dynamo-sglang" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 6 + prefill_workers: 3 + decode_nodes: 12 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + max-running-requests: 30000 + context-length: 2200 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.75 + max-total-tokens: 524288 + chunked-prefill-size: 131072 + + # Request handling + load-balance-method: 
"round_robin" + + # Performance optimizations + disable-cuda-graph: true + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "normal" + ep-dispatch-algorithm: "dynamic" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + disaggregation-transfer-backend: nixl + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 48 + dp-size: 48 + ep-size: 48 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + max-running-requests: 45000 + context-length: 2200 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.82 + chunked-prefill-size: 36864 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + + # CUDA graphs + cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768] + cuda-graph-max-bs: 768 + disaggregation-transfer-backend: nixl + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1024x2048x4096" + req_rate: "inf" + diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/ultra-tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/ultra-tpt.yaml new file mode 100644 index 000000000..08fe2fa90 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/ultra-tpt.yaml @@ -0,0 +1,176 @@ +name: "gb200-fp8-1k1k-ultra-tpt" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 3 + nginx_container: nginx + +model: + path: "dsr1-fp8" + container: "dynamo-sglang" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 2 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "640" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + 
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + max-running-requests: 8192 + context-length: 2200 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.75 + max-total-tokens: 524288 + chunked-prefill-size: 131072 + + # Request handling + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "normal" + ep-dispatch-algorithm: "dynamic" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + + disaggregation-transfer-backend: nixl + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + max-running-requests: 5120 + context-length: 2200 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.82 + chunked-prefill-size: 36864 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + + # CUDA graphs + cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640] + cuda-graph-max-bs: 640 + + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" + diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/8k1k/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/8k1k/low-latency.yaml new file mode 100644 index 000000000..368b03409 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/8k1k/low-latency.yaml @@ -0,0 +1,117 @@ +name: "gb200-fp8-8k1k-low-latency" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + 
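# Extra frontend replicas keep request ingest and tokenization off the
+  # critical path; the nginx container presumably load-balances across the
+  # primary and additional frontends.
+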
enable_multiple_frontends: true + num_additional_frontends: 2 + nginx_container: nginx + +model: + path: "dsr1-fp8" + container: "dynamo-sglang" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_node: 4 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "fp8" + moe-runner-backend: "flashinfer_trtllm" + disable-radix-cache: true + watchdog-timeout: 1000000 + context-length: 9600 + disaggregation-mode: "prefill" + mem-fraction-static: 0.8 + max-total-tokens: 32768 + chunked-prefill-size: 24576 + cuda-graph-max-bs: 512 + max-running-requests: 512 + load-balance-method: "round_robin" + scheduler-recv-interval: 10 + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 1 + fp8-gemm-backend: "flashinfer_trtllm" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + + decode: + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "fp8" + moe-runner-backend: "flashinfer_trtllm" + disable-radix-cache: true + watchdog-timeout: 1000000 + context-length: 9600 + disaggregation-mode: "decode" + mem-fraction-static: 0.8 + chunked-prefill-size: 8192 + cuda-graph-max-bs: 512 + max-running-requests: 512 + scheduler-recv-interval: 10 + enable-symm-mem: true + prefill-round-robin-balance: true + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 1 + fp8-gemm-backend: "flashinfer_trtllm" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x8x16" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/8k1k/max_tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/8k1k/max_tpt.yaml new file mode 100644 index 000000000..f03e34b8d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/8k1k/max_tpt.yaml @@ -0,0 +1,171 @@ +name: "gb200-8k1k-fp8-max-tpt" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + 
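# 10 frontend processes in total (1 primary + 9 additional) to feed the
+  # high-concurrency sweeps below.
+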
nginx_container: nginx + +model: + path: "dsr1-fp8" + container: "dynamo-sglang" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 12 + prefill_workers: 6 + decode_nodes: 6 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + max-running-requests: 30000 + context-length: 9300 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.80 + max-total-tokens: 524288 + chunked-prefill-size: 131072 + + # Request handling + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "normal" + ep-dispatch-algorithm: "dynamic" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 24 + dp-size: 24 + ep-size: 24 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + max-running-requests: 8192 + context-length: 9300 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.82 + chunked-prefill-size: 36864 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + 
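# SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK ("512" in decode_environment
+      # above) matches cuda-graph-max-bs below; these recipes appear to keep the
+      # per-rank dispatch budget in sync with the largest captured graph batch.
+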
ep-dispatch-algorithm: "static" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + # CUDA graphs + cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512] + cuda-graph-max-bs: 512 + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2048x4096x6144" + req_rate: "300" diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/8k1k/mid-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/8k1k/mid-curve.yaml new file mode 100644 index 000000000..c822d67f3 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/8k1k/mid-curve.yaml @@ -0,0 +1,170 @@ +name: "gb200-8k1k-fp8-mid-tpt" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + nginx_container: nginx + +model: + path: "dsr1-fp8" + container: "dynamo-sglang" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 10 + prefill_workers: 5 + decode_nodes: 8 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "256" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + max-running-requests: 30000 + context-length: 9300 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.80 + max-total-tokens: 524288 + chunked-prefill-size: 131072 + + # Request handling + load-balance-method: "round_robin" + + # Performance optimizations + 
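# CUDA graphs are disabled for prefill only: prefill batches are large and
+      # variable-shaped, so graph capture buys little here, while the decode
+      # section below still captures graphs up to cuda-graph-max-bs.
+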
disable-cuda-graph: true + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "normal" + ep-dispatch-algorithm: "dynamic" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 32 + dp-size: 32 + ep-size: 32 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + max-running-requests: 8192 + context-length: 9300 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.82 + chunked-prefill-size: 36864 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + # CUDA graphs + cuda-graph-max-bs: 256 + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "512x1024x2048x6144" + req_rate: "300" diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/1k1k/low_latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/1k1k/low_latency.yaml new file mode 100644 index 000000000..252eafa2b --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/1k1k/low_latency.yaml @@ -0,0 +1,116 @@ +name: "gb300-fp4-low-latency-1k1k" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 4 + nginx_container: nginx-sqsh + +model: + path: "dsr1" + container: "dynamo-sglang" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 2 + gpus_per_node: 4 + +backend: + + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + + sglang_config: + prefill: + disaggregation-mode: "prefill" + served-model-name: 
"deepseek-ai/DeepSeek-R1" + trust-remote-code: true + disable-radix-cache: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + stream-interval: 10 + watchdog-timeout: 1000000 + context-length: 2200 + mem-fraction-static: 0.95 + max-total-tokens: 8192 + chunked-prefill-size: 8192 + cuda-graph-max-bs: 256 + max-running-requests: 512 + scheduler-recv-interval: 10 + enable-symm-mem: true + load-balance-method: "round_robin" + disaggregation-bootstrap-port: 30001 + data-parallel-size: 1 + tensor-parallel-size: 4 + expert-parallel-size: 1 + fp4-gemm-backend: "flashinfer_trtllm" + disaggregation-transfer-backend: nixl + + decode: + disaggregation-mode: "decode" + served-model-name: "deepseek-ai/DeepSeek-R1" + prefill-round-robin-balance: true + trust-remote-code: true + disable-radix-cache: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + disaggregation-bootstrap-port: 30001 + stream-interval: 10 + watchdog-timeout: 1000000 + context-length: 2200 + mem-fraction-static: 0.95 + chunked-prefill-size: 8192 + cuda-graph-max-bs: 256 + scheduler-recv-interval: 10 + enable-symm-mem: true + tensor-parallel-size: 4 + expert-parallel-size: 1 + fp4-gemm-backend: "flashinfer_trtllm" + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x8x32" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/1k1k/max_tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/1k1k/max_tpt.yaml new file mode 100644 index 000000000..c941651aa --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/1k1k/max_tpt.yaml @@ -0,0 +1,184 @@ +name: "gb300-fp4-max-tpt-1k1k" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + nginx_container: nginx-sqsh + +model: + path: "dsr1" + container: "dynamo-sglang" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 4 + decode_nodes: 12 + prefill_workers: 4 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: 
"1024" + SGLANG_MOE_NVFP4_DISPATCH: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_cutlass" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 2176 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.84 + max-total-tokens: 131072 + max-prefill-tokens: 32768 + chunked-prefill-size: 65536 + enable-single-batch-overlap: true + + # Request handling + max-running-requests: 30000 + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + enable-dp-attention: true + disaggregation-transfer-backend: nixl + fp4-gemm-backend: "flashinfer_cutlass" + + # Parallelism + tp-size: 4 + dp-size: 4 + ep-size: 4 + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_cutedsl" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 2176 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.83 + max-total-tokens: 3122380 + chunked-prefill-size: 786432 + + # Request handling + max-running-requests: 67584 + enable-single-batch-overlap: true + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + ep-num-redundant-experts: 32 + + # CUDA graphs (extensive batch size list) + cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 1024] + num-reserved-decode-tokens: 112 + + # Additional decode optimizations + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" + disaggregation-transfer-backend: nixl + + + # Parallelism + tp-size: 48 + dp-size: 48 + ep-size: 48 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "512x2048x4096x8192" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/1k1k/mid_curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/1k1k/mid_curve.yaml new file mode 100644 index 000000000..15d3b3930 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/1k1k/mid_curve.yaml @@ -0,0 +1,182 @@ +name: "gb300-fp4-mid-curve-1k1k" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + nginx_container: nginx-sqsh + 
+model: + path: "dsr1" + container: "dynamo-sglang" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 4 + decode_nodes: 8 + prefill_workers: 4 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_MOE_NVFP4_DISPATCH: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_cutlass" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 2176 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Prefill-specific mode + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.84 + max-total-tokens: 131072 + max-prefill-tokens: 32768 + chunked-prefill-size: 65536 + enable-single-batch-overlap: true + + # Request handling + max-running-requests: 30000 + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" + + # Parallelism + tp-size: 4 + dp-size: 4 + ep-size: 4 + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_cutedsl" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 2176 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.83 + max-total-tokens: 3122380 + 
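# The non-round token cap above looks like an empirically measured KV-cache
+      # capacity at this mem-fraction-static rather than a hand-picked limit
+      # (assumption; value kept as-is from upstream tuning).
+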
chunked-prefill-size: 786432
+
+      # Request handling
+      max-running-requests: 67584
+
+      # DeepEP configuration
+      moe-a2a-backend: "deepep"
+      deepep-mode: "low_latency"
+      ep-dispatch-algorithm: "static"
+      ep-num-redundant-experts: 32
+
+      # CUDA graphs (extensive batch size list)
+      cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 1024]
+      num-reserved-decode-tokens: 112
+
+      # Additional decode optimizations
+      moe-dense-tp-size: 1
+      enable-dp-lm-head: true
+      prefill-round-robin-balance: true
+      enable-dp-attention: true
+      fp4-gemm-backend: "flashinfer_cutlass"
+      disaggregation-transfer-backend: nixl
+
+      # Parallelism
+      tp-size: 32
+      dp-size: 32
+      ep-size: 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "512x2048x4096x8192"
+  req_rate: "inf"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/8k1k/low_latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/8k1k/low_latency.yaml
new file mode 100644
index 000000000..d3c61231b
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/8k1k/low_latency.yaml
@@ -0,0 +1,119 @@
+name: "gb300-fp4-8k1k-low-latency"
+
+dynamo:
+  version: 0.8.1
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 3
+  nginx_container: nginx-sqsh
+
+model:
+  path: "dsr1"
+  container: "dynamo-sglang"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb300"
+  prefill_nodes: 1
+  decode_nodes: 4
+  prefill_workers: 1
+  decode_workers: 4
+  gpus_per_node: 4
+
+backend:
+
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+
+  sglang_config:
+    prefill:
+      disaggregation-mode: "prefill"
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      disable-radix-cache: true
+      kv-cache-dtype: "fp8_e4m3"
+      attention-backend: "trtllm_mla"
+      quantization: "modelopt_fp4"
+      moe-runner-backend: "flashinfer_trtllm"
+      stream-interval: 50
+      watchdog-timeout: 1000000
+      context-length: 9600
+      mem-fraction-static: 0.95
+      max-total-tokens: 32768
+      chunked-prefill-size: 24576
+      cuda-graph-max-bs: 256
+      max-running-requests: 512
+      scheduler-recv-interval: 10
+      enable-symm-mem: true
+      load-balance-method: "round_robin"
+      disaggregation-bootstrap-port: 30001
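+      # Plain tensor parallelism below: tensor-parallel-size 4 with data and
+      # expert parallelism of 1 spans exactly one 4-GPU node per prefill worker
+      # (gpus_per_node: 4 in resources).
+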
data-parallel-size: 1 + tensor-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: false + fp4-gemm-backend: "flashinfer_trtllm" + disaggregation-transfer-backend: nixl + + + decode: + disaggregation-mode: "decode" + served-model-name: "deepseek-ai/DeepSeek-R1" + prefill-round-robin-balance: true + trust-remote-code: true + disable-radix-cache: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + disaggregation-bootstrap-port: 30001 + stream-interval: 50 + watchdog-timeout: 1000000 + context-length: 9600 + mem-fraction-static: 0.95 + chunked-prefill-size: 8192 + cuda-graph-max-bs: 128 + scheduler-recv-interval: 10 + enable-symm-mem: true + tensor-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: false + fp4-gemm-backend: "flashinfer_trtllm" + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x8x32x64" + req_rate: 300 diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/8k1k/max_tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/8k1k/max_tpt.yaml new file mode 100644 index 000000000..001311ed7 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/8k1k/max_tpt.yaml @@ -0,0 +1,179 @@ +name: "gb300-fp4-8k1k-max-tpt" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + nginx_container: nginx-sqsh + +model: + path: "dsr1" + container: "dynamo-sglang" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 10 + decode_nodes: 8 + prefill_workers: 10 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512" + SGLANG_MOE_NVFP4_DISPATCH: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + 
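# stream-interval buffers streamed output and flushes roughly every 50 tokens,
+      # trimming per-token streaming overhead; the 1k1k low-latency recipes use 10
+      # for finer-grained streaming.
+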
decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 9600 + disable-shared-experts-fusion: true + disaggregation-bootstrap-port: 30001 + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.95 + max-total-tokens: 131072 + max-prefill-tokens: 524288 + chunked-prefill-size: 131072 + + # Request handling + max-running-requests: 30000 + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + enable-dp-attention: false + fp4-gemm-backend: "flashinfer_cutlass" + disaggregation-transfer-backend: nixl + + # Parallelism + tp-size: 4 + dp-size: 1 + ep-size: 1 + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_cutedsl" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 9600 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.83 + max-total-tokens: 524288 + chunked-prefill-size: 24576 + + # Request handling + max-running-requests: 16384 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + ep-num-redundant-experts: 32 + + cuda-graph-max-bs: 512 + num-reserved-decode-tokens: 112 + + # Additional decode optimizations + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" + disaggregation-transfer-backend: nixl + + # Parallelism + tp-size: 32 + dp-size: 32 + ep-size: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2048" + req_rate: 700 diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/8k1k/mid_curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/8k1k/mid_curve.yaml new file mode 100644 index 000000000..41043ed0d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/8k1k/mid_curve.yaml @@ -0,0 +1,179 @@ +name: "gb300-fp4-8k1k-mid-curve" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + nginx_container: nginx-sqsh + +model: + path: "dsr1" + container: "dynamo-sglang" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 6 + decode_nodes: 12 + prefill_workers: 6 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + # Decode-specific environment variables + decode_environment: + 
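# Same base environment as prefill, plus the DeepEP per-rank dispatch budget
+    # and the NVFP4 MoE dispatch toggle at the end of the list.
+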
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512" + SGLANG_MOE_NVFP4_DISPATCH: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 9600 + disable-shared-experts-fusion: true + disaggregation-bootstrap-port: 30001 + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.95 + max-total-tokens: 131072 + max-prefill-tokens: 524288 + chunked-prefill-size: 131072 + + # Request handling + max-running-requests: 30000 + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + enable-dp-attention: false + fp4-gemm-backend: "flashinfer_cutlass" + disaggregation-transfer-backend: nixl + + # Parallelism + tp-size: 4 + dp-size: 1 + ep-size: 1 + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_cutedsl" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 9600 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.83 + max-total-tokens: 524288 + chunked-prefill-size: 24576 + + # Request handling + max-running-requests: 16384 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + ep-num-redundant-experts: 32 + + cuda-graph-max-bs: 512 + num-reserved-decode-tokens: 112 + + # Additional decode optimizations + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" + disaggregation-transfer-backend: nixl + + # Parallelism + tp-size: 48 + dp-size: 48 + ep-size: 48 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "512x2048x4096" + req_rate: 700 diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/1k1k/stp/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/1k1k/stp/low-latency.yaml new file mode 100644 index 000000000..51628e081 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/1k1k/stp/low-latency.yaml @@ -0,0 
+1,122 @@ +name: "gb300-1k1k-fp8-low-latency" + +model: + path: "dsr1-fp8" + container: "dynamo-sglang" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + decode_nodes: 4 + prefill_workers: 1 + decode_workers: 4 + gpus_per_node: 4 + +slurm: + time_limit: "02:00:00" + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + # SGLANG_ENABLE_FLASHINFER_GEMM: "1" # deprecated in 0.5.7, --fp8-gemm-backend=flashinfer_trtllm + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + # SGLANG_ENABLE_FLASHINFER_GEMM: "1" # deprecated in 0.5.7, --fp8-gemm-backend=flashinfer_trtllm + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "fp8" + moe-runner-backend: "flashinfer_trtllm" + fp8-gemm-backend: "flashinfer_trtllm" + disable-radix-cache: true + stream-interval: 10 + watchdog-timeout: 1000000 + context-length: 2200 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + mem-fraction-static: 0.95 + max-total-tokens: 8192 + chunked-prefill-size: 8192 + max-prefill-tokens: 8192 + cuda-graph-max-bs: 128 + max-running-requests: 128 + load-balance-method: "round_robin" + scheduler-recv-interval: 10 + enable-flashinfer-allreduce-fusion: false # to save mem + enable-symm-mem: false # to save mem + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + decode: + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "fp8" + moe-runner-backend: "flashinfer_trtllm" + fp8-gemm-backend: "flashinfer_trtllm" + disable-radix-cache: true + stream-interval: 10 + watchdog-timeout: 1000000 + context-length: 2200 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + mem-fraction-static: 0.85 + chunked-prefill-size: -1 # save mem + cuda-graph-max-bs: 128 + max-running-requests: 128 + scheduler-recv-interval: 1 # save mem + enable-flashinfer-allreduce-fusion: false # to save mem + enable-symm-mem: false # to save mem + prefill-round-robin-balance: true + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [4,8,16,32] + req_rate: 
"inf" + diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/1k1k/stp/max.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/1k1k/stp/max.yaml new file mode 100644 index 000000000..c88a487b8 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/1k1k/stp/max.yaml @@ -0,0 +1,171 @@ +# GB300 FP8 Max Throughput Configuration + +name: "gb300-1k1k-fp8-max" + +model: + path: "dsr1-fp8" + container: "dynamo-sglang" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "gb300" + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 2 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + max-running-requests: 30000 + context-length: 2200 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.75 + max-total-tokens: 524288 + chunked-prefill-size: 131072 + + # Request handling + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "normal" + ep-dispatch-algorithm: "dynamic" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + disaggregation-transfer-backend: nixl + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache 
disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + max-running-requests: 45000 + context-length: 2200 + + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.82 + chunked-prefill-size: 36864 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + + # CUDA graphs + cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 1024] + cuda-graph-max-bs: 1024 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [4096,7168,7680] + req_rate: "inf" + diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/1k1k/stp/mid.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/1k1k/stp/mid.yaml new file mode 100644 index 000000000..ee6690285 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/1k1k/stp/mid.yaml @@ -0,0 +1,170 @@ +# GB300 FP8 Mid Throughput Configuration +name: "gb300-1k1k-fp8-mid" + +model: + path: "dsr1-fp8" + container: "dynamo-sglang" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "gb300" + prefill_nodes: 4 + prefill_workers: 2 + decode_nodes: 8 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + 
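# benchmark traffic has no shared prefixes, so the cache would only cost KV memory (our assumption; upstream does not say why) +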
disable-radix-cache: true + + # Other flags + stream-interval: 50 + max-running-requests: 30000 + context-length: 2200 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.75 + max-total-tokens: 524288 + chunked-prefill-size: 131072 + + # Request handling + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "normal" + ep-dispatch-algorithm: "dynamic" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + disaggregation-transfer-backend: nixl + + # Parallelism + tp-size: 32 + dp-size: 32 + ep-size: 32 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + max-running-requests: 45000 + context-length: 2200 + + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.82 + chunked-prefill-size: 36864 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + + # CUDA graphs + cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768] + cuda-graph-max-bs: 768 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [1024,2048,4096,6144] + req_rate: "inf" + diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/8k1k/stp/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/8k1k/stp/low-latency.yaml new file mode 100644 index 000000000..71fd0f889 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/8k1k/stp/low-latency.yaml @@ -0,0 +1,121 @@ +name: "gb300-8k1k-fp8-low-latency" + +model: + path: "dsr1-fp8" + container: "dynamo-sglang" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_node: 4 + +slurm: + time_limit: "02:00:00" + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + # SGLANG_ENABLE_FLASHINFER_GEMM: "1" # deprecated in 0.5.7, --fp8-gemm-backend=flashinfer_trtllm + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + 
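# NOTE: the *_HEARTBEAT/*_TIMEOUT values above are effectively "never", keeping the disaggregation watchdogs quiet through long startup and warmup (our reading of these knobs) +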
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + # SGLANG_ENABLE_FLASHINFER_GEMM: "1" # deprecated in 0.5.7, --fp8-gemm-backend=flashinfer_trtllm + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "fp8" + moe-runner-backend: "flashinfer_trtllm" + fp8-gemm-backend: "flashinfer_trtllm" + disable-radix-cache: true + stream-interval: 10 + watchdog-timeout: 1000000 + context-length: 9300 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + mem-fraction-static: 0.95 + max-total-tokens: 32768 + chunked-prefill-size: 32768 + max-prefill-tokens: 32768 + cuda-graph-max-bs: 128 + max-running-requests: 128 + load-balance-method: "round_robin" + scheduler-recv-interval: 10 + enable-flashinfer-allreduce-fusion: false # to save mem + enable-symm-mem: false # to save mem + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + decode: + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "fp8" + moe-runner-backend: "flashinfer_trtllm" + fp8-gemm-backend: "flashinfer_trtllm" + disable-radix-cache: true + stream-interval: 10 + watchdog-timeout: 1000000 + context-length: 9300 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + mem-fraction-static: 0.85 + chunked-prefill-size: -1 # save mem + cuda-graph-max-bs: 128 + max-running-requests: 128 + scheduler-recv-interval: 1 # save mem + enable-flashinfer-allreduce-fusion: false # to save mem + enable-symm-mem: false # to save mem + prefill-round-robin-balance: true + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [4,8] + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/8k1k/stp/max.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/8k1k/stp/max.yaml new file mode 100644 index 000000000..6d219cc1e --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/8k1k/stp/max.yaml @@ -0,0 +1,171 @@ +# GB300 FP8 Max Throughput Configuration + +name: "gb300-8k1k-fp8-max" + +model: + path: "dsr1-fp8" + container: "dynamo-sglang" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "gb300" + prefill_nodes: 12 + prefill_workers: 6 + decode_nodes: 6 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + 
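# assumed to be a pre-built DeepGEMM kernel cache mounted under /configs (snapshot dated 10-21-2025), so kernels load from disk instead of JIT-compiling at startup +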
DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + max-running-requests: 30000 + context-length: 9300 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.75 + max-total-tokens: 524288 + chunked-prefill-size: 131072 + + # Request handling + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "normal" + ep-dispatch-algorithm: "dynamic" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + disaggregation-transfer-backend: nixl + + # Parallelism + tp-size: 24 + dp-size: 24 + ep-size: 24 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + max-running-requests: 45000 + context-length: 9300 + + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.82 + chunked-prefill-size: 36864 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + + # CUDA graphs + cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 
72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768] + cuda-graph-max-bs: 768 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [2048,4096] + req_rate: "inf" + diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/8k1k/stp/mid.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/8k1k/stp/mid.yaml new file mode 100644 index 000000000..b085f50f8 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/8k1k/stp/mid.yaml @@ -0,0 +1,171 @@ +# GB300 FP8 Mid Throughput Configuration + +name: "gb300-8k1k-fp8-mid" + +model: + path: "dsr1-fp8" + container: "dynamo-sglang" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "gb300" + prefill_nodes: 10 + prefill_workers: 5 + decode_nodes: 8 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + max-running-requests: 30000 + context-length: 9300 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.75 + max-total-tokens: 524288 + chunked-prefill-size: 131072 + + # Request handling + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "normal" + ep-dispatch-algorithm: "dynamic" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + ep-num-redundant-experts: 32 + deepep-config: 
"/configs/deepep_config.json" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + disaggregation-transfer-backend: nixl + + # Parallelism + tp-size: 32 + dp-size: 32 + ep-size: 32 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + max-running-requests: 45000 + context-length: 9300 + + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.82 + chunked-prefill-size: 36864 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + + # CUDA graphs + cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768] + cuda-graph-max-bs: 768 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [128,256,512,1024] + req_rate: "inf" + diff --git a/benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml new file mode 100644 index 000000000..989fc47d1 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml @@ -0,0 +1,114 @@ +name: "h100-fp8-1p1d-max-dep-mtp" + +model: + path: "dsr1-fp8" + container: "lmsysorg/sglang:v0.5.8-cu130" + precision: "fp8" + +frontend: + nginx_container: nginx-sqsh + +resources: + gpu_type: "h100" + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 2 + decode_workers: 1 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + + # Decode-specific environment variables + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Prefill capacity + max-running-requests: 4 + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 2048 + chunked-prefill-size: 2048 + + # Request handling + load-balance-method: "round_robin" + + # MTP (Multi-Token Prediction) + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + 
trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 16 + dp-size: 16 + ep-size: 16 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 1 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-running-requests: 64 + cuda-graph-max-bs: 64 + + # MTP + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1x2x4x8x16x32x64" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/mtp/h100-fp8-1p2d-max-tp-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/mtp/h100-fp8-1p2d-max-tp-mtp.yaml new file mode 100644 index 000000000..0ce17e8a4 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/mtp/h100-fp8-1p2d-max-tp-mtp.yaml @@ -0,0 +1,116 @@ +name: "h100-fp8-1p2d-max-tp-mtp" + +model: + path: "dsr1-fp8" + container: "lmsysorg/sglang:v0.5.8-cu130" + precision: "fp8" + +frontend: + nginx_container: nginx-sqsh + +resources: + gpu_type: "h100" + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 4 + decode_workers: 2 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_SPEC_V2: "1" + + # Decode-specific environment variables + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_SPEC_V2: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + max-running-requests: 2 + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 2048 + chunked-prefill-size: 2048 + + # Request handling + load-balance-method: "round_robin" + + # MTP (Multi-Token Prediction) + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 1 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.9 + max-running-requests: 128 + cuda-graph-max-bs: 128 + + # MTP + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1x2x4x8x16x32x64x128" + req_rate: "inf" diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/stp/h100-fp8-1p1d-max-dep.yaml b/benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/stp/h100-fp8-1p1d-max-dep.yaml new file mode 100644 index 000000000..c47b6c867 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/stp/h100-fp8-1p1d-max-dep.yaml @@ -0,0 +1,102 @@ +name: "h100-fp8-1p1d-max-dep" + +model: + path: "dsr1-fp8" + container: "lmsysorg/sglang:v0.5.8-cu130" + precision: "fp8" + +resources: + gpu_type: "h100" + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 2 + decode_workers: 1 + gpus_per_node: 8 + +frontend: + nginx_container: nginx-sqsh + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + + # Decode-specific environment variables + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Prefill capacity + max-running-requests: 4 + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 2048 + chunked-prefill-size: 2048 + + # Request handling + load-balance-method: "round_robin" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 16 + dp-size: 16 + ep-size: 16 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 1 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.9 + max-running-requests: 64 + cuda-graph-max-bs: 64 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1x2x4x8x16x32x64" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/stp/h100-fp8-1p2d-max-tp.yaml b/benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/stp/h100-fp8-1p2d-max-tp.yaml new file mode 100644 index 000000000..1f7cf9985 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/stp/h100-fp8-1p2d-max-tp.yaml @@ -0,0 +1,102 @@ +name: "h100-fp8-1p2d-max-tp" + +model: + path: "dsr1-fp8" + container: "lmsysorg/sglang:v0.5.8-cu130" + precision: "fp8" + +resources: + gpu_type: "h100" + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 4 + decode_workers: 2 + gpus_per_node: 8 + +frontend: + nginx_container: nginx-sqsh + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + + # Decode-specific environment variables + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and 
attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + max-running-requests: 2 + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 2048 + chunked-prefill-size: 2048 + + # Request handling + load-balance-method: "round_robin" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 1 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.9 + max-running-requests: 128 + cuda-graph-max-bs: 128 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1x2x4x8x16x32x64x128" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml new file mode 100644 index 000000000..4a0448658 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml @@ -0,0 +1,116 @@ +name: "h100-fp8-1p1d-max-dep-mtp" + +model: + path: "dsr1-fp8" + container: "lmsysorg/sglang:v0.5.8-cu130" + precision: "fp8" + +resources: + gpu_type: "h100" + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 2 + decode_workers: 1 + gpus_per_node: 8 + +frontend: + nginx_container: nginx-sqsh + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_SPEC_V2: "1" + + # Decode-specific environment variables + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_SPEC_V2: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Prefill capacity + max-running-requests: 4 + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 2048 + chunked-prefill-size: 2048 + + # Request handling + load-balance-method: "round_robin" + + # MTP (Multi-Token Prediction) + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 16 + dp-size: 16 + ep-size: 16 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 1 + + # Disagg + disaggregation-bootstrap-port: 30001 + 
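# same port as the prefill side above; we assume the KV-transfer handshake pairs the two workers through it +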
disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-running-requests: 64 + cuda-graph-max-bs: 64 + + # MTP + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x2x4x8x16x32x64" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/mtp/h100-fp8-1p1d-max-tp-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/mtp/h100-fp8-1p1d-max-tp-mtp.yaml new file mode 100644 index 000000000..591556df7 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/mtp/h100-fp8-1p1d-max-tp-mtp.yaml @@ -0,0 +1,116 @@ +name: "h100-fp8-1p1d-max-tp-mtp" + +model: + path: "dsr1-fp8" + container: "lmsysorg/sglang:v0.5.8-cu130" + precision: "fp8" + +resources: + gpu_type: "h100" + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 2 + decode_workers: 1 + gpus_per_node: 8 + +frontend: + nginx_container: nginx-sqsh + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_SPEC_V2: "1" + + # Decode-specific environment variables + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_SPEC_V2: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Prefill capacity + max-running-requests: 2 + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 2048 + chunked-prefill-size: 2048 + + # Request handling + load-balance-method: "round_robin" + + # MTP (Multi-Token Prediction) + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 1 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.9 + max-running-requests: 128 + cuda-graph-max-bs: 128 + + # MTP (Multi-Token Prediction) + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x2x4x8x16x32x64x128" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/stp/h100-fp8-1p1d-max-dep.yaml b/benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/stp/h100-fp8-1p1d-max-dep.yaml new file mode 100644 index 000000000..6c8a1c956 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/stp/h100-fp8-1p1d-max-dep.yaml @@ -0,0 +1,102 @@ +name: 
"h100-fp8-1p1d-max-dep" + +model: + path: "dsr1-fp8" + container: "lmsysorg/sglang:v0.5.8-cu130" + precision: "fp8" + +resources: + gpu_type: "h100" + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 2 + decode_workers: 1 + gpus_per_node: 8 + +frontend: + nginx_container: nginx-sqsh + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + + # Decode-specific environment variables + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Prefill capacity + max-running-requests: 4 + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 2048 + chunked-prefill-size: 2048 + + # Request handling + load-balance-method: "round_robin" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 16 + dp-size: 16 + ep-size: 16 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 1 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.9 + max-running-requests: 64 + cuda-graph-max-bs: 64 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x2x4x8x16x32x64" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/stp/h100-fp8-1p1d-max-tp.yaml b/benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/stp/h100-fp8-1p1d-max-tp.yaml new file mode 100644 index 000000000..196e781df --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/stp/h100-fp8-1p1d-max-tp.yaml @@ -0,0 +1,102 @@ +name: "h100-fp8-1p1d-max-tp" + +model: + path: "dsr1-fp8" + container: "lmsysorg/sglang:v0.5.8-cu130" + precision: "fp8" + +resources: + gpu_type: "h100" + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 2 + decode_workers: 1 + gpus_per_node: 8 + +frontend: + nginx_container: nginx-sqsh + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + + # Decode-specific environment variables + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Prefill capacity + max-running-requests: 2 + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.6 
+ max-prefill-tokens: 2048 + chunked-prefill-size: 2048 + + # Request handling + load-balance-method: "round_robin" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 1 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.9 + max-running-requests: 128 + cuda-graph-max-bs: 128 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x2x4x8x16x32x64x128" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-dep-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-dep-mtp.yaml new file mode 100644 index 000000000..2c6539c93 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-dep-mtp.yaml @@ -0,0 +1,121 @@ +name: "bs256-1p6d-h200-fp8-mtp" + +model: + path: "dsr1" + container: "lmsysorg/sglang:v0.5.8.post1-cu130" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 6 + decode_workers: 6 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + # Decode-specific environment variables + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + # used to be 512 + max-running-requests: 64 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + # used to be 0.75 + mem-fraction-static: 0.82 + max-prefill-tokens: 65536 + # used to be 262144 + chunked-prefill-size: 65536 + + # Request handling + load-balance-method: "round_robin" + + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + max-running-requests: 
128 + cuda-graph-max-bs: 128 + + # MTP settings + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "128x256x512x1024x2048" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-dep.yaml b/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-dep.yaml new file mode 100644 index 000000000..1932dc222 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-dep.yaml @@ -0,0 +1,109 @@ +name: "bs256-1p6d-h200-fp8" + +model: + path: "dsr1" + container: "lmsysorg/sglang:v0.5.8.post1-cu130" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 6 + decode_workers: 6 + gpus_per_node: 8 + +backend: + + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + watchdog-timeout: 1000000 + max-running-requests: 512 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + max-prefill-tokens: 65536 + chunked-prefill-size: 262144 + + # Request handling + load-balance-method: "round_robin" + + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + watchdog-timeout: 1000000 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-running-requests: 512 + cuda-graph-max-bs: 512 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "128x256x512x1024x2048" + req_rate: "inf" + diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-tp-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-tp-mtp.yaml new file mode 100644 index 000000000..f2fc08020 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-tp-mtp.yaml @@ -0,0 +1,118 @@ +name: "bs256-1p6d-h200-fp8-mtp" + +model: + path: "dsr1" + container: "lmsysorg/sglang:v0.5.8.post1-cu130" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 6 + decode_workers: 6 + gpus_per_node: 8 + +backend: + + # Prefill-specific 
environment variables + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + # Decode-specific environment variables + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + max-running-requests: 512 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.7 + max-prefill-tokens: 163840 + chunked-prefill-size: 163840 + + # Request handling + load-balance-method: "round_robin" + + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + max-running-requests: 128 + cuda-graph-max-bs: 128 + + # MTP settings + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + # concurrencies: "128x256x512" + concurrencies: "512x1024x2048" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-tp.yaml b/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-tp.yaml new file mode 100644 index 000000000..05afea199 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-tp.yaml @@ -0,0 +1,109 @@ +name: "bs256-1p6d-h200-fp8" + +model: + path: "dsr1" + container: "lmsysorg/sglang:v0.5.8.post1-cu130" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 6 + decode_workers: 6 + gpus_per_node: 8 + +backend: + + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + 
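# flashinfer is used across the h100/h200 recipes here, while the Blackwell (gb200/gb300) recipes use trtllm_mla (observation within this directory) +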
attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + watchdog-timeout: 1000000 + max-running-requests: 512 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.7 + max-prefill-tokens: 163840 + chunked-prefill-size: 163840 + + # Request handling + load-balance-method: "round_robin" + + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + watchdog-timeout: 1000000 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-running-requests: 512 + cuda-graph-max-bs: 512 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + # concurrencies: "128x256x512" + concurrencies: "512x1024x2048" + req_rate: "inf" + diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/low-latency-1p9d-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/low-latency-1p9d-mtp.yaml new file mode 100644 index 000000000..5d6e66ebb --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/low-latency-1p9d-mtp.yaml @@ -0,0 +1,116 @@ +name: "low-latency-1p9d-h200-fp8-mtp" + +model: + path: "dsr1" + container: "lmsysorg/sglang:v0.5.8.post1-cu130" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 9 + decode_workers: 9 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + # Decode-specific environment variables + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + max-running-requests: 256 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-prefill-tokens: 163840 + chunked-prefill-size: 163840 + + # Request handling + load-balance-method: "round_robin" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + 
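# dp/ep stay at 1 above: plain TP-8 decode favors per-token latency over batch throughput, matching this low-latency variant (our reading) +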
attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + max-running-requests: 64 + cuda-graph-max-bs: 64 + + # MTP settings + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1x4x8x16x32x64x128x256" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/low-latency-1p9d.yaml b/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/low-latency-1p9d.yaml new file mode 100644 index 000000000..e60102aae --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/low-latency-1p9d.yaml @@ -0,0 +1,106 @@ +name: "low-latency-1p9d-h200-fp8" + +model: + path: "dsr1" + container: "lmsysorg/sglang:v0.5.8.post1-cu130" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 9 + decode_workers: 9 + gpus_per_node: 8 + +backend: + + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + watchdog-timeout: 1000000 + max-running-requests: 256 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-prefill-tokens: 163840 + chunked-prefill-size: 163840 + + # Request handling + load-balance-method: "round_robin" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + watchdog-timeout: 1000000 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-running-requests: 256 + cuda-graph-max-bs: 256 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1x4x8x16x32x64x128x256" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs128-1p1d-dep-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs128-1p1d-dep-mtp.yaml new file mode 100644 index 000000000..4d62e5a04 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs128-1p1d-dep-mtp.yaml @@ -0,0 +1,118 @@ +name: "bs128-1p1d-dep-h200-fp8-mtp" + +model: + path: "dsr1" + 
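# short model key; presumably resolved by the launch tooling to the DeepSeek-R1 FP8 checkpoint (hypothetical mapping, not defined in this file) +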
container: "lmsysorg/sglang:v0.5.8.post1-cu130" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 1 + decode_workers: 1 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + # Decode-specific environment variables + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + max-running-requests: 16 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + max-prefill-tokens: 163840 + chunked-prefill-size: 163840 + + # Request handling + load-balance-method: "round_robin" + + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-running-requests: 192 + cuda-graph-max-bs: 192 + + # MTP settings + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "32x64x128x256x512" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs128-1p1d-dep.yaml b/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs128-1p1d-dep.yaml new file mode 100644 index 000000000..d131f6b02 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs128-1p1d-dep.yaml @@ -0,0 +1,109 @@ +name: "bs128-1p1d-dep-h200-fp8" + +model: + path: "dsr1" + container: "lmsysorg/sglang:v0.5.8.post1-cu130" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 1 + decode_workers: 1 + gpus_per_node: 8 + +backend: + + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: 
"100000" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + watchdog-timeout: 1000000 + max-running-requests: 16 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + max-prefill-tokens: 163840 + chunked-prefill-size: 163840 + + # Request handling + load-balance-method: "round_robin" + + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + watchdog-timeout: 1000000 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.88 + max-running-requests: 256 + cuda-graph-max-bs: 256 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "64x128x256" + req_rate: "inf" + diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs16-1p3d-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs16-1p3d-mtp.yaml new file mode 100644 index 000000000..97ea49b9a --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs16-1p3d-mtp.yaml @@ -0,0 +1,116 @@ +name: "bs16-1p3d-h200-fp8-mtp" + +model: + path: "dsr1" + container: "lmsysorg/sglang:v0.5.8.post1-cu130" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 3 + decode_workers: 3 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + # Decode-specific environment variables + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + max-running-requests: 16 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + + # Request handling + load-balance-method: "round_robin" + + decode: + # Model configuration + served-model-name: 
"deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-running-requests: 32 + cuda-graph-max-bs: 32 + + # MTP settings + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x8x16x32x64" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs16-1p3d.yaml b/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs16-1p3d.yaml new file mode 100644 index 000000000..576ff2a03 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs16-1p3d.yaml @@ -0,0 +1,107 @@ +name: "bs16-1p3d-h200-fp8" + +model: + path: "dsr1" + container: "lmsysorg/sglang:v0.5.8.post1-cu130" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 3 + decode_workers: 3 + gpus_per_node: 8 + +backend: + + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + watchdog-timeout: 1000000 + max-running-requests: 16 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + + # Request handling + load-balance-method: "round_robin" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + watchdog-timeout: 1000000 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-running-requests: 32 + cuda-graph-max-bs: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "8x16x32" + req_rate: "inf" + diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs4-1p7d-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs4-1p7d-mtp.yaml new file mode 100644 index 000000000..d58d55b1b --- /dev/null +++ 
b/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs4-1p7d-mtp.yaml @@ -0,0 +1,116 @@ +name: "bs4-1p7d-h200-fp8-mtp" + +model: + path: "dsr1" + container: "lmsysorg/sglang:v0.5.8.post1-cu130" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 7 + decode_workers: 7 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + # Decode-specific environment variables + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + max-running-requests: 16 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + + # Request handling + load-balance-method: "round_robin" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + max-running-requests: 2 + cuda-graph-max-bs: 2 + + # MTP settings + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x4x8" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs4-1p7d.yaml b/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs4-1p7d.yaml new file mode 100644 index 000000000..78ce3d5a1 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs4-1p7d.yaml @@ -0,0 +1,107 @@ +name: "bs4-1p7d-h200-fp8" + +model: + path: "dsr1" + container: "lmsysorg/sglang:v0.5.8.post1-cu130" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 7 + decode_workers: 7 + gpus_per_node: 8 + +backend: + + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + 
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + watchdog-timeout: 1000000 + max-running-requests: 16 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + + # Request handling + load-balance-method: "round_robin" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + watchdog-timeout: 1000000 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-running-requests: 8 + cuda-graph-max-bs: 8 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x4x8" + req_rate: "inf" + diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs64-2p3d-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs64-2p3d-mtp.yaml new file mode 100644 index 000000000..ed1232d16 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs64-2p3d-mtp.yaml @@ -0,0 +1,125 @@ +name: "bs64-2p3d-h200-fp8-mtp" + +model: + path: "dsr1" + container: "lmsysorg/sglang:v0.5.8.post1-cu130" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "h200" + prefill_nodes: 2 + prefill_workers: 2 + decode_nodes: 3 + decode_workers: 3 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + # Decode-specific environment variables + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + max-running-requests: 16 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + + # Request handling + load-balance-method: "round_robin" + + 
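+    # Note: the decode block below is the other half of the disaggregated
+    # pair; it registers as disaggregation-mode "decode" and receives KV
+    # blocks from the prefill workers over the shared NIXL transfer backend.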
decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + context-length: 72000 + max-total-tokens: 128000 + # Memory and token limits + mem-fraction-static: 0.75 + max-running-requests: 16 + cuda-graph-max-bs: 16 + + # MTP settings + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "32x64x128" + req_rate: "inf" + +# benchmark: +# type: "gpqa" +# num_examples: 198 +# repeat: 4 +# num_threads: 32 +# max_tokens: 64000 diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs64-2p3d.yaml b/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs64-2p3d.yaml new file mode 100644 index 000000000..73aaacc30 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs64-2p3d.yaml @@ -0,0 +1,115 @@ +name: "bs64-2p3d-h200-fp8" + +model: + path: "dsr1" + container: "lmsysorg/sglang:v0.5.8.post1-cu130" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "h200" + prefill_nodes: 2 + prefill_workers: 2 + decode_nodes: 3 + decode_workers: 3 + gpus_per_node: 8 + +backend: + + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + watchdog-timeout: 1000000 + max-running-requests: 16 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + + # Request handling + load-balance-method: "round_robin" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + watchdog-timeout: 1000000 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + #context-length: 72000 + # max-total-tokens: 128000 + # Memory and token limits + mem-fraction-static: 0.82 + max-running-requests: 128 + cuda-graph-max-bs: 128 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + 
concurrencies: "32x64x128" + req_rate: "inf" + +# benchmark: +# type: "gpqa" +# num_examples: 198 +# repeat: 4 +# num_threads: 32 +# max_tokens: 64000 \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs8-1p6d-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs8-1p6d-mtp.yaml new file mode 100644 index 000000000..5bd83fa5c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs8-1p6d-mtp.yaml @@ -0,0 +1,117 @@ +name: "bs8-1p6d-h200-fp8-mtp" + +model: + path: "dsr1" + container: "lmsysorg/sglang:v0.5.8.post1-cu130" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 6 + decode_workers: 6 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + # Decode-specific environment variables + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + max-running-requests: 16 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + + # Request handling + load-balance-method: "round_robin" + + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-running-requests: 16 + cuda-graph-max-bs: 16 + + # MTP settings + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2x4x8x16x32" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs8-1p6d.yaml b/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs8-1p6d.yaml new file mode 100644 index 000000000..c37c50eea --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs8-1p6d.yaml @@ -0,0 +1,108 @@ +name: "bs8-1p6d-h200-fp8" + +model: + path: "dsr1" + container: "lmsysorg/sglang:v0.5.8.post1-cu130" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 6 + 
decode_workers: 6 + gpus_per_node: 8 + +backend: + + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + watchdog-timeout: 1000000 + max-running-requests: 16 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + + # Request handling + load-balance-method: "round_robin" + + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + watchdog-timeout: 1000000 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-running-requests: 16 + cuda-graph-max-bs: 16 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x8x16" + req_rate: "inf" + diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml new file mode 100644 index 000000000..ce3eff436 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml @@ -0,0 +1,125 @@ +name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep16_batch32_eplb0_mtp0" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=32 +# STP (no speculative decoding) +# concurrency: 666 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + 
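+      # Note: the context phase below runs without CUDA graphs
+      # (cuda_graph_config: null) and with the overlap scheduler disabled,
+      # the usual TRT-LLM setup for prefill-only workers; its KV fraction
+      # is kept low (0.4), presumably because blocks are shipped to decode
+      # as soon as prefill completes.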
tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "666" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml new file mode 100644 index 000000000..105b84bfd --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml @@ -0,0 +1,129 @@ +name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep32_batch64_eplb0_mtp0" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=64 +# STP (no speculative decoding) +# concurrency: 2253 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + 
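+      # Note: with speculative decoding off (mtp0) each request emits one
+      # token per step, so max_num_tokens can match max_batch_size (64);
+      # enable_padding rounds in-flight batches up to the nearest captured
+      # CUDA-graph size.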
tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 64 + max_num_tokens: 64 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "2253" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: true + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml new file mode 100644 index 000000000..9fb194ddc --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml @@ -0,0 +1,217 @@ +name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 1 decode worker, TP8/EP8, enable_attention_dp=true, max_batch=768 +# STP (no speculative decoding) +# Covers all dep8 concurrencies: 4301, 6452 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 2 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 768 + max_num_tokens: 768 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + 
- 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + - 264 + - 272 + - 280 + - 288 + - 296 + - 304 + - 312 + - 320 + - 328 + - 336 + - 344 + - 352 + - 360 + - 368 + - 376 + - 384 + - 392 + - 400 + - 408 + - 416 + - 424 + - 432 + - 440 + - 448 + - 456 + - 464 + - 472 + - 480 + - 488 + - 496 + - 504 + - 512 + - 520 + - 528 + - 536 + - 544 + - 552 + - 560 + - 568 + - 576 + - 584 + - 592 + - 600 + - 608 + - 616 + - 624 + - 632 + - 640 + - 648 + - 656 + - 664 + - 672 + - 680 + - 688 + - 696 + - 704 + - 712 + - 720 + - 728 + - 736 + - 744 + - 752 + - 760 + - 768 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4301x6452" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml new file mode 100644 index 000000000..5639da411 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml @@ -0,0 +1,138 @@ +name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 4 decode workers, TP8/EP8, allreduce_strategy=MNNVL, max_batch=128 +# STP (no speculative decoding) +# Covers all gen4tep8 concurrencies: 4, 192, 360, 668 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 4 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + allreduce_strategy: MNNVL + 
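+      # Note: MNNVL (multi-node NVLink) keeps the TP8 allreduce on the
+      # NVLink domain even though each decode worker spans two 4-GPU
+      # GB200 nodes.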
enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 128 + max_num_tokens: 128 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x192x360x668" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml new file mode 100644 index 000000000..f9496feb6 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml @@ -0,0 +1,122 @@ +name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 5 decode workers, TP4/EP4, max_batch=8 +# STP (no speculative decoding) +# Covers all gen5tep4 concurrencies: 5, 15, 30, 55 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 5 + decode_nodes: 5 + gpus_per_decode: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 8 + max_num_tokens: 8 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: 
true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "5x15x30x55" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml new file mode 100644 index 000000000..71b016c4b --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml @@ -0,0 +1,153 @@ +name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx2dep4_gen1dep16_batch256_eplb0_mtp0" + +# ctx: 2 prefill workers, TP4/EP4 +# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=256 +# STP (no speculative decoding) +# concurrency: 4301 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 256 + max_num_tokens: 256 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + 
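+# Note: sa-bench holds the single concurrency below (4301, matching the
+# header comment) at an unbounded ("inf") request rate.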
+benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4301" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml new file mode 100644 index 000000000..52b75bb4e --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml @@ -0,0 +1,137 @@ +name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx2dep4_gen1dep32_batch128_eplb0_mtp0" + +# ctx: 2 prefill workers, TP4/EP4 +# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=128 +# STP (no speculative decoding) +# concurrency: 4301 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 128 + max_num_tokens: 128 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4301" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml new file mode 100644 index 000000000..8c1f0aa82 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml @@ -0,0 +1,126 @@ +name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 4 decode workers, TP4/EP4, max_batch=32 +# Single concurrency point: 156 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + # Prefill: 1 worker x TP4 = 4 GPUs = 1 node + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + # Decode: 4 workers x TP4 = 16 GPUs = 4 nodes + decode_workers: 4 + decode_nodes: 4 + gpus_per_decode: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "156" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml new file mode 100644 index 000000000..d4c5086b0 --- /dev/null +++ 
b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml @@ -0,0 +1,123 @@ +name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 4 decode workers, TP8/EP8, allreduce_strategy=MNNVL, max_batch=1 +# Single concurrency point: 4 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + # Prefill: 1 worker x TP4 = 4 GPUs = 1 node + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + # Decode: 4 workers x TP8 = 32 GPUs = 8 nodes + decode_workers: 4 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + allreduce_strategy: MNNVL + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 1 + max_num_tokens: 1 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml new file mode 100644 index 000000000..8f6ea063f --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml @@ -0,0 +1,126 @@ +name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 5 decode workers, TP4/EP4, max_batch=16 +# Covers all concurrencies: 5, 15, 30, 60, 105 + +model: + path: 
"nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + # Prefill: 1 worker x TP4 = 4 GPUs = 1 node + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + # Decode: 5 workers x TP4 = 20 GPUs = 5 nodes + decode_workers: 5 + decode_nodes: 5 + gpus_per_decode: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + # max_batch_size=16 covers all concs: 5, 15, 30, 60, 105 + # cuda_graph pre-compiles graphs for each batch size up to the max + max_batch_size: 16 + max_num_tokens: 16 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "5x15x30x60x105" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml new file mode 100644 index 000000000..4bfaa0e2c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml @@ -0,0 +1,124 @@ +name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx2dep4_gen1dep16_batch16_eplb0_mtp0" + +# ctx: 2 prefill workers, TP4/EP4 +# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=16 +# concurrency: 333 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + # Prefill: 2 workers x TP4 = 8 GPUs = 2 nodes + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 4 + + # Decode: 1 worker x TP16 = 16 GPUs = 4 nodes + 
decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 16 + max_num_tokens: 16 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "333" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml new file mode 100644 index 000000000..d7d51627c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml @@ -0,0 +1,126 @@ +name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx3dep4_gen1dep16_batch32_eplb0_mtp0" + +# ctx: 3 prefill workers, TP4/EP4 +# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=32 +# concurrency: 615 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + # Prefill: 3 workers x TP4 = 12 GPUs = 3 nodes + prefill_nodes: 3 + prefill_workers: 3 + gpus_per_prefill: 4 + + # Decode: 1 worker x TP16 = 16 GPUs = 4 nodes + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + 
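+    # Per its name, this demotes the autotuner's DEBUG output to INFO; it is
+    # set only on decode workers in these recipes, presumably because the
+    # autotuner is most verbose during CUDA-graph warmup.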
TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "615" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml new file mode 100644 index 000000000..e8df1179b --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml @@ -0,0 +1,155 @@ +name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0" + +# ctx: 5 prefill workers, TP4/EP4 +# gen: 1 decode worker, TP8/EP8, enable_attention_dp=true, max_batch=256 +# Single concurrency point: 2151 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + # Prefill: 5 workers x TP4 = 20 GPUs = 5 nodes + prefill_nodes: 5 + prefill_workers: 5 + gpus_per_prefill: 4 + + # Decode: 1 worker x TP8 = 8 GPUs = 2 nodes + decode_workers: 1 + decode_nodes: 2 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + 
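+      # Reading of the numbers (not documented upstream): 8232 looks like
+      # ISL 8192 plus a ~40-token margin for chat-template/special tokens;
+      # the decode engine's 9256 is then 8232 + OSL 1024.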
print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + # max_batch_size=256, cuda_graph pre-compiles graphs for all batch sizes up to 256 + max_batch_size: 256 + max_num_tokens: 256 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2151" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml new file mode 100644 index 000000000..db1778920 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml @@ -0,0 +1,138 @@ +name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx7dep4_gen1dep16_batch128_eplb0_mtp0" + +# ctx: 7 prefill workers, TP4/EP4 +# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=128 +# concurrency: 2253 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + # Prefill: 7 workers x TP4 = 28 GPUs = 7 nodes + prefill_nodes: 7 + prefill_workers: 7 + gpus_per_prefill: 4 + + # Decode: 1 worker x TP16 = 16 GPUs = 4 nodes + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + 
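+        # KV cache is held in FP8 even though the weights are NVFP4;
+        # assumption: FP8 KV roughly halves the cache footprint vs. BF16 at
+        # negligible accuracy cost, and FP4 KV does not appear to be an
+        # option here.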
enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 128 + max_num_tokens: 128 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2253" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml new file mode 100644 index 000000000..d4d9de835 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml @@ -0,0 +1,119 @@ +name: "ctx1_gen2_dep8_batch64_eplb0_mtp2" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 2 + decode_nodes: 2 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 192 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 58 + - 60 + - 62 + - 64 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + 
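+        # The DEP decode recipes in this set select the CUTLASS MoE backend,
+        # while the TEP variants use TRTLLM; presumably each is the faster
+        # kernel for its parallel layout.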
use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1214" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml new file mode 100644 index 000000000..9532b9cc5 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml @@ -0,0 +1,117 @@ +name: "ctx1_gen5_dep8_batch16_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 5 + decode_nodes: 5 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 64 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 13 + - 14 + - 15 + - 16 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "875" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml new file mode 100644 index 000000000..31bf5bf20 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml @@ -0,0 
+1,112 @@ +name: "ctx1_gen5_tep8_batch1_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 5 + decode_nodes: 5 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 4 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "6" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml new file mode 100644 index 000000000..3a3309f56 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml @@ -0,0 +1,126 @@ +name: "ctx1_gen5_tep8_batch32_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 5 + decode_nodes: 5 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + 
TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 3 + - 4 + - 5 + - 8 + - 9 + - 10 + - 16 + - 17 + - 18 + - 29 + - 30 + - 31 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "10x15x25x45x90x180" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml new file mode 100644 index 000000000..90ad2c657 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml @@ -0,0 +1,120 @@ +name: "ctx3_gen4_dep8_batch128_eplb0_mtp1" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 2 + prefill_workers: 3 + gpus_per_prefill: 4 + + decode_workers: 4 + decode_nodes: 4 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + 
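+      # "dep8" in the recipe name means 8-way expert parallelism with
+      # attention data parallelism enabled just below; the "tep8" recipes
+      # instead keep attention tensor-parallel (enable_attention_dp: false).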
enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 128 + max_num_tokens: 256 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 122 + - 124 + - 126 + - 128 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4968" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml new file mode 100644 index 000000000..31adc6239 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml @@ -0,0 +1,126 @@ +name: "ctx3_gen5_dep4_batch512_eplb0_mtp1" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 2 + prefill_workers: 3 + gpus_per_prefill: 4 + + decode_workers: 5 + decode_nodes: 3 + gpus_per_decode: 4 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 512 + max_num_tokens: 1024 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 192 + - 256 + - 384 + - 448 + - 506 + - 508 + - 510 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 
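+  # Assumed sa-bench semantics: req_rate "inf" means arrivals are throttled
+  # only by the concurrency cap, i.e. a closed-loop run at the listed
+  # concurrency.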
+ concurrencies: "10860" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml new file mode 100644 index 000000000..6c3e4bf80 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml @@ -0,0 +1,117 @@ +name: "ctx1_gen1_dep8_batch512_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 1 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 512 + max_num_tokens: 512 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 384 + - 448 + - 508 + - 510 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml new file mode 100644 index 000000000..56746330e --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml @@ -0,0 +1,114 @@ +name: "ctx1_gen2_dep8_batch128_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 2 + decode_nodes: 2 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + 
OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 128 + max_num_tokens: 128 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 122 + - 124 + - 126 + - 128 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "2192" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml new file mode 100644 index 000000000..0fde29f21 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml @@ -0,0 +1,112 @@ +name: "ctx1_gen5_dep8_batch32_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 5 + decode_nodes: 5 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + 
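+      # The cache transceiver stages the prefill-to-decode KV-cache handoff
+      # over UCX; the same 8448-token buffer (8192 + 256) is reused across
+      # these recipes regardless of ISL.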
cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 26 + - 28 + - 30 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1365" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml new file mode 100644 index 000000000..4612b7c2c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml @@ -0,0 +1,106 @@ +name: "ctx1_gen5_tep8_batch1_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 5 + decode_nodes: 5 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 1 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "6" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml new file mode 100644 index 000000000..53e833b75 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml @@ -0,0 +1,127 @@ +name: "ctx1_gen5_tep8_batch32_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 5 + decode_nodes: 5 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 3 + - 4 + - 5 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + - 16 + - 18 + - 20 + - 22 + - 24 + - 26 + - 28 + - 30 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "10x15x25x45x90x180" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml new file mode 100644 index 000000000..47c2c6e22 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml @@ -0,0 +1,116 @@ +name: "ctx1_gen6_tep8_batch64_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 6 + decode_nodes: 6 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + 
TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 64 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 56 + - 58 + - 60 + - 62 + - 64 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "450" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml new file mode 100644 index 000000000..a1ec4f38d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml @@ -0,0 +1,116 @@ +name: "ctx1_gen1_dep8_batch8_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 1 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + 
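+      # Block reuse (prefix caching) is disabled throughout these recipes,
+      # presumably so repeated benchmark prompts cannot take prefix-cache
+      # hits that would skew prefill measurements.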
cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 32 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 5 + - 6 + - 7 + - 8 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "90" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml new file mode 100644 index 000000000..48aad03b6 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml @@ -0,0 +1,123 @@ +name: "ctx1_gen3_tep8_batch16_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 3 + decode_nodes: 3 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 64 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + - 16 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 
4 + allreduce_strategy: MNNVL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "66" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml new file mode 100644 index 000000000..559841f73 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml @@ -0,0 +1,112 @@ +name: "ctx1_gen5_tep8_batch1_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 5 + decode_nodes: 5 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 4 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "6" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml new file mode 100644 index 000000000..f9d9843f6 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml @@ -0,0 +1,119 @@ +name: "ctx1_gen5_tep8_batch8_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + 
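+# Recipe-name legend, as used throughout this directory: ctxN = N prefill
+# (context) workers, genM = M decode (generation) workers, tep/dep = decode
+# attention layout (tensor-parallel vs. data-parallel), batchB = decode
+# max_batch_size, mtpK = K MTP draft layers (speculative_config's
+# num_nextn_predict_layers), and eplb0 = expert-parallel load balancing off
+# (the eplb reading is inferred from the field name).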
+resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 5 + decode_nodes: 5 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 32 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "10x15x30x60" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml new file mode 100644 index 000000000..7e06d12b5 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml @@ -0,0 +1,120 @@ +name: "ctx3_gen1_dep8_batch64_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 2 + prefill_workers: 3 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 1 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + 
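+    # Disaggregated serving runs prefill (context) and decode (generation)
+    # as separate TRT-LLM engines, so batching, KV-cache fraction, and
+    # CUDA-graph settings diverge between the two sections below.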
prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 48 + - 56 + - 60 + - 62 + - 64 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "548" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml new file mode 100644 index 000000000..96b4d97c5 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml @@ -0,0 +1,124 @@ +name: "ctx5_gen1_dep8_batch192_eplb0_mtp1" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 3 + prefill_workers: 5 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 1 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 192 + max_num_tokens: 384 + max_seq_len: 9256 + cuda_graph_config: + 
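+        # With enable_padding, an incoming batch is padded up to the nearest
+        # captured size, so the list below only needs representative sizes
+        # rather than every value up to max_batch_size.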
enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 130 + - 132 + - 134 + - 136 + - 138 + - 168 + - 192 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1096x1691" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml new file mode 100644 index 000000000..98229c7bf --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml @@ -0,0 +1,119 @@ +name: "ctx5_gen2_dep8_batch32_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 3 + prefill_workers: 5 + gpus_per_prefill: 4 + + decode_workers: 2 + decode_nodes: 2 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 20 + - 24 + - 28 + - 30 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "658" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml new file mode 100644 index 000000000..762987f6e --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml @@ -0,0 +1,107 @@ +name: "ctx1_gen5_tep8_batch1_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 5 + decode_nodes: 5 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 1 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "6" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml new file mode 100644 index 000000000..a03114f95 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml @@ -0,0 +1,120 @@ +name: "ctx1_gen5_tep8_batch8_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 5 + decode_nodes: 5 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + 
UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 16 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 12 + - 13 + - 14 + - 15 + - 16 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "10x15x25x50x100" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml new file mode 100644 index 000000000..4dfe07604 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml @@ -0,0 +1,115 @@ +name: "ctx2_gen5_tep8_batch64_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 2 + gpus_per_prefill: 4 + + decode_workers: 5 + decode_nodes: 5 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + 
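+      # tep8: attention stays tensor-parallel (attention DP off below) while
+      # the MoE layers remain expert-parallel across the same 8 GPUs.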
moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 64 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 58 + - 60 + - 62 + - 64 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "370" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml new file mode 100644 index 000000000..23c2db5d8 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml @@ -0,0 +1,118 @@ +name: "ctx4_gen1_dep8_batch192_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 2 + prefill_workers: 4 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 1 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 192 + max_num_tokens: 192 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 152 + - 160 + - 168 + - 176 + - 184 + - 190 + - 192 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1606" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml new file mode 100644 index 000000000..e94326803 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml @@ -0,0 +1,111 @@ +name: "ctx4_gen3_dep8_batch32_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 2 + prefill_workers: 4 + gpus_per_prefill: 4 + + decode_workers: 3 + decode_nodes: 3 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 28 + - 30 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "837" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml new file mode 100644 index 000000000..b3c9e1300 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml @@ -0,0 +1,114 @@ +name: "ctx7_gen2_dep8_batch128_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 4 + prefill_workers: 7 + gpus_per_prefill: 4 + + decode_workers: 2 + decode_nodes: 2 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" 
+ UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 128 + max_num_tokens: 128 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 116 + - 120 + - 124 + - 128 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2222" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml new file mode 100644 index 000000000..8c7cf706d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml @@ -0,0 +1,121 @@ +name: ctx1_gen2_dep8_batch768_eplb0_mtp2_1600 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 2 + decode_nodes: 2 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1152 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1152 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + 
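+      # fp8 1k1k prefill profile: DEEPGEMM MoE, attention DP over 8 ranks,
+      # up to 16384 tokens per forward pass, 40% of free memory for KV.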
speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1152 + cuda_graph_config: + enable_padding: true + max_batch_size: 768 + disable_overlap_scheduler: false + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 768 + max_num_tokens: 2304 + max_seq_len: 2176 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [1600] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml new file mode 100644 index 000000000..dd06e8462 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml @@ -0,0 +1,121 @@ +name: ctx1_gen3_dep8_batch384_eplb0_mtp3_1184 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 3 + decode_nodes: 3 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1152 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1152 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1152 + cuda_graph_config: + enable_padding: true + max_batch_size: 384 + disable_overlap_scheduler: false + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 384 + max_num_tokens: 1536 + max_seq_len: 2176 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + 
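+        # MTP-3: up to 4 tokens verified per request per step, which is where
+        # max_num_tokens 1536 = 384 (batch) x 4 above comes from.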
decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [1184] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml new file mode 100644 index 000000000..d41d81458 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml @@ -0,0 +1,121 @@ +name: ctx1_gen4_dep8_batch256_eplb0_mtp3_1024 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 4 + decode_nodes: 4 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1152 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1152 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1152 + cuda_graph_config: + enable_padding: true + max_batch_size: 256 + disable_overlap_scheduler: false + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 256 + max_num_tokens: 1024 + max_seq_len: 2176 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [1024] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml new file mode 100644 
index 000000000..3b4193e44 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml @@ -0,0 +1,121 @@ +name: ctx1_gen7_dep8_batch128_eplb0_mtp3_896 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 7 + decode_nodes: 7 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1152 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1152 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1152 + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + disable_overlap_scheduler: false + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 128 + max_num_tokens: 512 + max_seq_len: 2176 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [896] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml new file mode 100644 index 000000000..de08fe729 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml @@ -0,0 +1,121 @@ +name: ctx1_gen8_tp8_batch1_eplb0_mtp3_8 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 8 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: 
"1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1152 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1152 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1152 + cuda_graph_config: + enable_padding: true + max_batch_size: 1 + disable_overlap_scheduler: false + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 1 + max_num_tokens: 4 + max_seq_len: 2176 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [8] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml new file mode 100644 index 000000000..0b67948c3 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml @@ -0,0 +1,121 @@ +name: ctx1_gen8_tp8_batch32_eplb0_mtp3_256 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 8 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1152 + cuda_graph_config: null + 
disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1152 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1152 + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + disable_overlap_scheduler: false + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 32 + max_num_tokens: 256 + max_seq_len: 2176 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [256] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml new file mode 100644 index 000000000..a79351e20 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml @@ -0,0 +1,121 @@ +name: ctx1_gen8_tp8_batch4_eplb0_mtp3_32 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 8 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1152 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1152 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1152 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + disable_overlap_scheduler: false + 
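+      # Decode keeps the overlap scheduler on; only prefill disables it.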
enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 4 + max_num_tokens: 256 + max_seq_len: 2176 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [32] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml new file mode 100644 index 000000000..1814ff355 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml @@ -0,0 +1,121 @@ +name: ctx1_gen8_tp8_batch8_eplb0_mtp3_64 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 8 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1152 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1152 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1152 + cuda_graph_config: + enable_padding: true + max_batch_size: 8 + disable_overlap_scheduler: false + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 8 + max_num_tokens: 256 + max_seq_len: 2176 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [64] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: 
false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml new file mode 100644 index 000000000..2e0ac949f --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml @@ -0,0 +1,115 @@ +name: ctx1_gen1_dep8_batch512_eplb0_mtp0_4096 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 1 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1152 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.2 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1152 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1152 + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + disable_overlap_scheduler: false + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 512 + max_num_tokens: 4096 + max_seq_len: 2176 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 40 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [4096] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml new file mode 100644 index 000000000..47008c9f0 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml @@ -0,0 +1,115 @@ +name: ctx1_gen3_tp8_batch1024_eplb0_mtp0_128 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 3 + decode_nodes: 3 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: 
trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 1152 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1152 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 1152 + cuda_graph_config: + enable_padding: true + max_batch_size: 1024 + disable_overlap_scheduler: false + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 1024 + max_num_tokens: 4096 + max_seq_len: 2176 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [128] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml new file mode 100644 index 000000000..aa2d8c6f2 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml @@ -0,0 +1,115 @@ +name: ctx1_gen3_tp8_batch1024_eplb0_mtp0_32 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 3 + decode_nodes: 3 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 
1152 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1152 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 1152 + cuda_graph_config: + enable_padding: true + max_batch_size: 12 + disable_overlap_scheduler: false + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 12 + max_num_tokens: 12 + max_seq_len: 2176 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [32] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml new file mode 100644 index 000000000..b9829e22f --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml @@ -0,0 +1,115 @@ +name: ctx1_gen3_tp8_batch1024_eplb0_mtp0_4 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 3 + decode_nodes: 3 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 1152 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1152 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 1152 + cuda_graph_config: + enable_padding: true + max_batch_size: 1 + disable_overlap_scheduler: false + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 1 + max_num_tokens: 1 + max_seq_len: 2176 + 
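+      # 2176 = 1024 ISL + 1024 OSL (benchmark below) plus 128 tokens of
+      # headroom, presumably for template/special tokens.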
moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [4] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml new file mode 100644 index 000000000..56df5bad2 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml @@ -0,0 +1,115 @@ +name: ctx1_gen5_dep8_batch48_eplb0_mtp0_1920 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 5 + decode_nodes: 5 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 1152 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1152 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 1152 + cuda_graph_config: + enable_padding: true + max_batch_size: 48 + disable_overlap_scheduler: false + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 48 + max_num_tokens: 4096 + max_seq_len: 2176 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [1920] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml new file mode 100644 index 000000000..a412a6419 --- /dev/null +++ 
b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml @@ -0,0 +1,115 @@ +name: ctx2_gen5_dep8_batch128_eplb0_mtp0_5152 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 8 + + decode_workers: 5 + decode_nodes: 5 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 1152 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1152 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 1152 + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + disable_overlap_scheduler: false + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 128 + max_num_tokens: 4096 + max_seq_len: 2176 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [5152] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml new file mode 100644 index 000000000..2ccfffba7 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml @@ -0,0 +1,123 @@ +name: ctx1_gen2_tp8_batch32_eplb0_mtp3_8 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 2 + decode_nodes: 2 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + 
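+  # Note: the fp8 recipes also pin TLLM_OVERRIDE_LAYER_NUM=61 on both the
+  # prefill and decode side; the fp4 recipes above leave it unset.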
decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.2 + max_batch_size: 1 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + attention_dp_config: + enable_balance: true + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + disable_overlap_scheduler: false + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 4 + max_num_tokens: 16 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [8] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml new file mode 100644 index 000000000..a9ad0a7d9 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml @@ -0,0 +1,123 @@ +name: ctx1_gen4_tp8_batch16_eplb0_mtp3_64 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 4 + decode_nodes: 4 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + 
free_gpu_memory_fraction: 0.2 + max_batch_size: 1 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + attention_dp_config: + enable_balance: true + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + disable_overlap_scheduler: false + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 16 + max_num_tokens: 64 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [64] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml new file mode 100644 index 000000000..38b12e6c0 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml @@ -0,0 +1,123 @@ +name: ctx1_gen6_tp8_batch8_eplb0_mtp3_48 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 6 + decode_nodes: 6 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.2 + max_batch_size: 1 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + attention_dp_config: + enable_balance: true + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 8 + disable_overlap_scheduler: false + enable_attention_dp: false + kv_cache_config: 
+ dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 8 + max_num_tokens: 32 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [48] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml new file mode 100644 index 000000000..3b38311b7 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml @@ -0,0 +1,123 @@ +name: ctx1_gen6_tp8_batch8_eplb0_mtp3_8 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 6 + decode_nodes: 6 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.2 + max_batch_size: 1 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + attention_dp_config: + enable_balance: true + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 1 + disable_overlap_scheduler: false + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 1 + max_num_tokens: 4 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [8] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: 
false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml new file mode 100644 index 000000000..378123831 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml @@ -0,0 +1,125 @@ +name: ctx2_gen1_dep8_batch32_eplb0_mtp3_288 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 1 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.2 + max_batch_size: 1 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + disable_overlap_scheduler: false + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 32 + max_num_tokens: 1024 + max_seq_len: 9344 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [288] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml new file mode 100644 index 000000000..a26eaf4f1 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml @@ -0,0 +1,125 @@ +name: ctx2_gen3_dep8_batch8_eplb0_mtp3_224 + +model: + path: "dsr1-fp8" + container: 
"dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 8 + + decode_workers: 3 + decode_nodes: 3 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.2 + max_batch_size: 1 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 8 + disable_overlap_scheduler: false + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 8 + max_num_tokens: 256 + max_seq_len: 9344 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [224] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml new file mode 100644 index 000000000..3c659d4dc --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml @@ -0,0 +1,125 @@ +name: ctx4_gen1_dep8_batch128_eplb0_mtp2_1088 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 4 + prefill_workers: 4 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 1 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + 
UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 1 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + disable_overlap_scheduler: false + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 128 + max_num_tokens: 3072 + max_seq_len: 9344 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [1088] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml new file mode 100644 index 000000000..6c383e60e --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml @@ -0,0 +1,115 @@ +name: ctx1_gen1_dep8_batch128_eplb0_mtp0_128 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 1 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + 
enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.2 + max_batch_size: 1 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + disable_overlap_scheduler: false + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 128 + max_num_tokens: 512 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [128] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml new file mode 100644 index 000000000..7821ab79e --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml @@ -0,0 +1,115 @@ +name: ctx1_gen1_dep8_batch256_eplb0_mtp0_256 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 1 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.2 + max_batch_size: 1 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 256 + disable_overlap_scheduler: false + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 256 + max_num_tokens: 512 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 8 + 
pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [256] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml new file mode 100644 index 000000000..0f2fdd949 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml @@ -0,0 +1,117 @@ +name: ctx1_gen1_tp8_batch1_eplb0_mtp0_1 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 1 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + disable_overlap_scheduler: true + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 64 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 1 + disable_overlap_scheduler: false + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 1 + max_num_tokens: 1 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [1] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml new file mode 100644 index 000000000..305c15124 --- /dev/null +++ 
b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml @@ -0,0 +1,115 @@ +name: ctx1_gen2_dep8_batch64_eplb0_mtp0_128 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 2 + decode_nodes: 2 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.2 + max_batch_size: 1 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + disable_overlap_scheduler: false + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 64 + max_num_tokens: 512 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [128] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml new file mode 100644 index 000000000..3c64aacf5 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml @@ -0,0 +1,116 @@ +name: ctx1_gen4_tp8_batch32_eplb0_mtp0_128 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 4 + decode_nodes: 4 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: 
+ NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.2 + max_batch_size: 1 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + disable_overlap_scheduler: false + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 32 + max_num_tokens: 512 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [128] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml new file mode 100644 index 000000000..751bdd585 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml @@ -0,0 +1,116 @@ +name: ctx1_gen4_tp8_batch32_eplb0_mtp0_32 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 4 + decode_nodes: 4 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.2 + max_batch_size: 1 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + 
pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 8 + disable_overlap_scheduler: false + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 8 + max_num_tokens: 8 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [32] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml new file mode 100644 index 000000000..cb4c4d8a3 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml @@ -0,0 +1,116 @@ +name: ctx1_gen6_tp8_batch16_eplb0_mtp0_96 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 6 + decode_nodes: 6 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.2 + max_batch_size: 1 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + disable_overlap_scheduler: false + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 16 + max_num_tokens: 512 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [96] + req_rate: "inf" + +frontend: + type: "dynamo" + + 
enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml new file mode 100644 index 000000000..db804a6b6 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml @@ -0,0 +1,115 @@ +name: ctx2_gen1_dep8_batch640_eplb0_mtp0_640 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 1 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.2 + max_batch_size: 1 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 640 + disable_overlap_scheduler: false + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 640 + max_num_tokens: 512 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [640] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml new file mode 100644 index 000000000..36b365a7d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml @@ -0,0 +1,127 @@ +name: "ctx1_gen1_dep8_batch64_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + 
decode_workers: 1 + decode_nodes: 1 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 256 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "654" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml new file mode 100644 index 000000000..f2cd900c9 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml @@ -0,0 +1,125 @@ +name: "ctx1_gen2_dep8_batch16_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 2 + decode_nodes: 2 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + 
TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 64 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "271" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml new file mode 100644 index 000000000..31bae1596 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml @@ -0,0 +1,123 @@ +name: "ctx1_gen5_tep8_batch1_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 5 + decode_nodes: 5 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: 
"mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 4 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "11" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml new file mode 100644 index 000000000..eeb43290a --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml @@ -0,0 +1,139 @@ +name: "ctx1_gen5_tep8_batch32_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 5 + decode_nodes: 5 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + 
TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 3 + - 4 + - 5 + - 8 + - 10 + - 11 + - 12 + - 16 + - 18 + - 20 + - 22 + - 23 + - 24 + - 28 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "10x20x25x60x120x200" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml new file mode 100644 index 000000000..7f8b9ae4a --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml @@ -0,0 +1,129 @@ +name: "ctx2_gen1_dep8_batch256_eplb0_mtp1" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 2 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 1 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + 
UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 256 + max_num_tokens: 512 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "2342" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml new file mode 100644 index 000000000..98d8ab04d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml @@ -0,0 +1,130 @@ +name: "ctx5_gen2_dep8_batch512_eplb0_mtp1" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 2 + prefill_workers: 5 + gpus_per_prefill: 2 + + decode_workers: 2 + decode_nodes: 2 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: 
"mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 512 + max_num_tokens: 1024 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "8609" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml new file mode 100644 index 000000000..a81e980ec --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml @@ -0,0 +1,131 @@ +name: "ctx5_gen2_dep8_batch768_eplb0_mtp1" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 2 + prefill_workers: 5 + gpus_per_prefill: 2 + + decode_workers: 2 + decode_nodes: 2 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + 
max_seq_len: 1044 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 768 + max_num_tokens: 1536 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "12926" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml new file mode 100644 index 000000000..13978a422 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml @@ -0,0 +1,121 @@ +name: "ctx1_gen2_dep8_batch64_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 2 + decode_nodes: 2 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + 
backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 64 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1176" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml new file mode 100644 index 000000000..5885277d0 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml @@ -0,0 +1,117 @@ +name: "ctx1_gen4_tep8_batch1_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 4 + decode_nodes: 4 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 1 + 
max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "6" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml new file mode 100644 index 000000000..9d73c7308 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml @@ -0,0 +1,121 @@ +name: "ctx1_gen5_tep4_batch4_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 5 + decode_nodes: 3 + gpus_per_decode: 4 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 4 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 3 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + 
num_postprocess_workers: 4 + allreduce_strategy: MNNVL + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "5x10x15x25" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml new file mode 100644 index 000000000..92b99de35 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml @@ -0,0 +1,136 @@ +name: "ctx1_gen5_tep8_batch64_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 5 + decode_nodes: 5 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 64 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 3 + - 4 + - 5 + - 8 + - 10 + - 11 + - 12 + - 16 + - 18 + - 20 + - 22 + - 27 + - 32 + - 35 + - 39 + - 48 + - 56 + - 64 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "60x110x195x395" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff 
--git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml new file mode 100644 index 000000000..3113744c9 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml @@ -0,0 +1,124 @@ +name: "ctx2_gen1_dep8_batch512_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 2 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 1 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 512 + max_num_tokens: 512 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4405" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml new file mode 100644 index 000000000..d74782639 --- /dev/null +++ 
b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml @@ -0,0 +1,129 @@ +name: "ctx3_gen1_dep8_batch1024_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 3 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 1 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1024 + max_num_tokens: 1024 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 832 + - 896 + - 960 + - 1024 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "8192" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml new file mode 100644 index 000000000..5088b566c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml @@ -0,0 +1,123 @@ +name: "ctx3_gen2_dep8_batch256_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 3 + gpus_per_prefill: 2 
+ + decode_workers: 2 + decode_nodes: 2 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 256 + max_num_tokens: 256 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4611" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml new file mode 100644 index 000000000..c24f57918 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml @@ -0,0 +1,129 @@ +name: "ctx10_gen1_dep8_batch256_eplb0_mtp1" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 3 + prefill_workers: 10 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 1 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + 
UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 256 + max_num_tokens: 512 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2198" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml new file mode 100644 index 000000000..7e2ab395a --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml @@ -0,0 +1,127 @@ +name: "ctx1_gen4_tep4_batch8_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 4 + decode_nodes: 2 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: 
"mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 32 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + gpus_per_node: 4 + allreduce_strategy: MNNVL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "52" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml new file mode 100644 index 000000000..83c7af6ad --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml @@ -0,0 +1,123 @@ +name: "ctx1_gen4_tep8_batch1_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 4 + decode_nodes: 4 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + 
OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 4 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "8" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml new file mode 100644 index 000000000..723029b8d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml @@ -0,0 +1,126 @@ +name: "ctx1_gen4_tep8_batch4_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 4 + decode_nodes: 4 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + 
UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 16 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 3 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "32" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml new file mode 100644 index 000000000..67e9fc568 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml @@ -0,0 +1,125 @@ +name: "ctx3_gen1_dep8_batch16_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 3 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 1 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + 
trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 64 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "181" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml new file mode 100644 index 000000000..b0494f78f --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml @@ -0,0 +1,128 @@ +name: "ctx9_gen1_dep8_batch128_eplb0_mtp1" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 3 + prefill_workers: 9 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 1 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + 
disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 128 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1197" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml new file mode 100644 index 000000000..5bc38c22a --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml @@ -0,0 +1,123 @@ +name: "ctx1_gen3_tep4_batch32_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 3 + decode_nodes: 2 + gpus_per_decode: 4 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 
8448 + backend: UCX + + decode: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "105" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml new file mode 100644 index 000000000..002aa9e27 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml @@ -0,0 +1,121 @@ +name: "ctx1_gen3_tep8_batch16_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 3 + decode_nodes: 3 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 16 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + print_iter_log: true + 
kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "63" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml new file mode 100644 index 000000000..5e8d96a80 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml @@ -0,0 +1,117 @@ +name: "ctx1_gen3_tep8_batch1_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 3 + decode_nodes: 3 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 1 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL
+ +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml new file mode 100644 index 000000000..df7612f99 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml @@ -0,0 +1,118 @@ +name: "ctx1_gen4_tep4_batch2_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 4 + decode_nodes: 2 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 2 + max_num_tokens: 2 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "12" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml new file mode 100644 index 000000000..b791d44b8 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml @@ -0,0 +1,120 @@ +name: "ctx5_gen2_dep8_batch32_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 2 + prefill_workers: 5 + gpus_per_prefill: 2 + + decode_workers: 2 + decode_nodes: 2 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "589" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml new file mode 100644 index 000000000..09b89137c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml @@ -0,0 +1,128 @@ +name: "ctx6_gen1_dep8_batch128_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + 
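+# The six 2-GPU prefill workers declared below pack onto the two reserved
+# 8-GPU prefill nodes (12 of 16 GPUs in use), so prefill_workers need not
+# divide evenly across prefill_nodes; this is a reading of the recipe
+# family, not a documented constraint.
+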
+resources:
+  gpu_type: "b300"
+  prefill_nodes: 2
+  prefill_workers: 6
+  gpus_per_prefill: 2
+
+  decode_workers: 1
+  decode_nodes: 1
+
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    OMPI_MCA_coll_ucc_enable: "0"
+    TLLM_ALL_RANK_LOG: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "n"
+    UCX_MAX_RMA_RAILS: "1"
+    UCX_MAX_RNDV_RAILS: "1"
+    UCX_RNDV_SCHEME: "put_zcopy"
+    OMPI_MCA_btl: "tcp,self"
+    OMPI_MCA_pml: "ob1"
+    TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1"
+
+  decode_environment:
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    OMPI_MCA_coll_ucc_enable: "0"
+    TLLM_ALL_RANK_LOG: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "n"
+    UCX_MAX_RMA_RAILS: "1"
+    UCX_MAX_RNDV_RAILS: "1"
+    UCX_RNDV_SCHEME: "put_zcopy"
+    OMPI_MCA_btl: "tcp,self"
+    OMPI_MCA_pml: "ob1"
+    TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1"
+
+
+  trtllm_config:
+    prefill:
+      max_batch_size: 2
+      max_num_tokens: 16896
+      max_seq_len: 8232
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      moe_config:
+        backend: TRTLLM
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.75
+        dtype: fp8
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8448
+        backend: UCX
+
+    decode:
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 128
+      max_num_tokens: 128
+      max_seq_len: 9256
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 32
+          - 64
+          - 128
+          - 256
+          - 512
+          - 768
+          - 1024
+          - 2048
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.8
+        dtype: fp8
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8448
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "1093"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+dynamo:
+  install: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml
new file mode 100644
index 000000000..0ca0d7692
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml
@@ -0,0 +1,127 @@
+name: "ctx8_gen1_dep8_batch256_eplb0_mtp0"
+
+model:
+  path: "dsr1"
+  container: "dynamo-trtllm"
+  precision: "fp4"
+
+resources:
+  gpu_type: "b300"
+  prefill_nodes: 2
+  prefill_workers: 8
+  gpus_per_prefill: 2
+
+  decode_workers: 1
+  decode_nodes: 1
+
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    OMPI_MCA_coll_ucc_enable: "0"
+    TLLM_ALL_RANK_LOG: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "n"
+    UCX_MAX_RMA_RAILS: "1"
+    UCX_MAX_RNDV_RAILS: "1"
+    UCX_RNDV_SCHEME: "put_zcopy"
+    OMPI_MCA_btl: "tcp,self"
+    OMPI_MCA_pml: "ob1"
+    TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1"
+
+  decode_environment:
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    OMPI_MCA_coll_ucc_enable: "0"
+    TLLM_ALL_RANK_LOG: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "n"
+    UCX_MAX_RMA_RAILS: "1"
+    UCX_MAX_RNDV_RAILS: "1"
+    UCX_RNDV_SCHEME: "put_zcopy"
+    OMPI_MCA_btl: "tcp,self"
+    OMPI_MCA_pml: "ob1"
+    TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1"
+
+
+  trtllm_config:
+    prefill:
+      max_batch_size: 2
+      max_num_tokens: 16896
+      max_seq_len: 8232
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      moe_config:
+        backend: TRTLLM
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.75
+        dtype: fp8
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8448
+        backend: UCX
+
+    decode:
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 256
+      max_num_tokens: 256
+      max_seq_len: 9256
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 32
+          - 64
+          - 128
+          - 256
+          - 512
+          - 768
+          - 1024
+          - 2048
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.8
+        dtype: fp8
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8448
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "2048"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+dynamo:
+  install: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml
new file mode 100644
index 000000000..cfa58f2a3
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml
@@ -0,0 +1,133 @@
+name: ctx1_gen1_dp8_batch256_eplb0_mtp1_3072
+
+model:
+  path: "dsr1-fp8"
+  container: "dynamo-trtllm"
+  precision: "fp4"
+
+resources:
+  gpu_type: "b300"
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+
+  decode_workers: 1
+  decode_nodes: 1
+  gpus_per_decode: 8
+
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    OMPI_MCA_coll_ucc_enable: "0"
+    TLLM_ALL_RANK_LOG: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "n"
+    UCX_MAX_RMA_RAILS: "1"
+    UCX_MAX_RNDV_RAILS: "1"
+    UCX_RNDV_SCHEME: "put_zcopy"
+    OMPI_MCA_btl: "tcp,self"
+    OMPI_MCA_pml: "ob1"
+    TRTLLM_UCX_INTERFACE:
"mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 1280 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: true + max_batch_size: 256 + disable_overlap_scheduler: false + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 256 + max_num_tokens: 2100 + max_seq_len: 2400 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [3072] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml new file mode 100644 index 000000000..866ccbb8e --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml @@ -0,0 +1,133 @@ +name: ctx1_gen2_dep8_batch128_eplb0_mtp1_2560 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 2 + decode_nodes: 2 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: 
"mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 1280 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + disable_overlap_scheduler: false + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 128 + max_num_tokens: 1100 + max_seq_len: 2400 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [2560] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml new file mode 100644 index 000000000..4e7600a2c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml @@ -0,0 +1,133 @@ +name: ctx1_gen5_dep8_batch16_eplb0_mtp2_720 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 5 + decode_nodes: 5 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: 
"mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 1280 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + disable_overlap_scheduler: false + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 16 + max_num_tokens: 180 + max_seq_len: 2400 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [720] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml new file mode 100644 index 000000000..a00639e26 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml @@ -0,0 +1,134 @@ +name: ctx1_gen8_tp8_batch16_eplb0_mtp3_160 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 8 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: 
"mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 1280 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + disable_overlap_scheduler: false + enable_attention_dp: false + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 16 + max_num_tokens: 384 + max_seq_len: 2400 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [160] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml new file mode 100644 index 000000000..62ae3984f --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml @@ -0,0 +1,134 @@ +name: ctx1_gen8_tp8_batch1_eplb0_mtp3_10 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 8 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: 
"mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 1280 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: true + max_batch_size: 1 + disable_overlap_scheduler: false + enable_attention_dp: false + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 1 + max_num_tokens: 4 + max_seq_len: 2400 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [10] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml new file mode 100644 index 000000000..957676992 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml @@ -0,0 +1,133 @@ +name: ctx3_gen2_dp8_batch512_eplb0_mtp1_11264 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 2 + prefill_workers: 3 + gpus_per_prefill: 4 + + decode_workers: 2 + decode_nodes: 2 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: 
"mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 1280 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + disable_overlap_scheduler: false + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 512 + max_num_tokens: 4200 + max_seq_len: 2400 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [11264] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml new file mode 100644 index 000000000..f41079a54 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml @@ -0,0 +1,127 @@ +name: ctx1_gen1_dep8_batch256_eplb0_mtp0_2112 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 1 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: 
"mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 1280 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: true + max_batch_size: 256 + disable_overlap_scheduler: false + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 256 + max_num_tokens: 2048 + max_seq_len: 2400 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [2112] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml new file mode 100644 index 000000000..7746b638c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml @@ -0,0 +1,127 @@ +name: ctx1_gen2_dp8_batch128_eplb0_mtp0_3072 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 2 + decode_nodes: 2 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + 
NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 1280 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + disable_overlap_scheduler: false + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 128 + max_num_tokens: 1024 + max_seq_len: 2400 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [3072] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml new file mode 100644 index 000000000..bdaef8f3e --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml @@ -0,0 +1,127 @@ +name: ctx1_gen3_dp8_batch48_eplb0_mtp0_1280 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 3 + decode_nodes: 3 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + 
UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 1280 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: true + max_batch_size: 48 + disable_overlap_scheduler: false + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 48 + max_num_tokens: 384 + max_seq_len: 2400 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [1280] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml new file mode 100644 index 000000000..f469bf3bc --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml @@ -0,0 +1,128 @@ +name: ctx1_gen8_tp8_batch64_eplb0_mtp0_12 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 8 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + 
TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 1280 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: true + max_batch_size: 1 + disable_overlap_scheduler: false + enable_attention_dp: false + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 1 + max_num_tokens: 1 + max_seq_len: 2400 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [10] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml new file mode 100644 index 000000000..b3b2d8740 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml @@ -0,0 +1,128 @@ +name: ctx1_gen8_tp8_batch64_eplb0_mtp0_128 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 8 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: 
"mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 1280 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + disable_overlap_scheduler: false + enable_attention_dp: false + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 64 + max_num_tokens: 64 + max_seq_len: 2400 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [128] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml new file mode 100644 index 000000000..36476736b --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml @@ -0,0 +1,128 @@ +name: ctx1_gen8_tp8_batch64_eplb0_mtp0_384 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 8 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + 
trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 1280 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + disable_overlap_scheduler: false + enable_attention_dp: false + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 64 + max_num_tokens: 64 + max_seq_len: 2400 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [384] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml new file mode 100644 index 000000000..c9d131239 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml @@ -0,0 +1,127 @@ +name: ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 2 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 1 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: false + 
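+      # Prefill disables the overlap scheduler and leaves CUDA graphs
+      # unpadded: a disaggregated prefill request completes in one forward
+      # pass, so there is no next iteration to overlap with (presumed
+      # rationale, consistent across these recipes).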
disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 1280 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: true + max_batch_size: 1024 + disable_overlap_scheduler: false + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 1024 + max_num_tokens: 8192 + max_seq_len: 2400 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [16384] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml new file mode 100644 index 000000000..7e806469c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml @@ -0,0 +1,133 @@ +name: ctx1_gen1_dp8_batch8_eplb0_mtp3_72 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 1 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + 
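+        # Block reuse is off, likely so repeated benchmark prompts do not
+        # hit the prefix cache and inflate prefill throughput (intent
+        # inferred, not stated upstream).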
free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 8 + disable_overlap_scheduler: false + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 8 + max_num_tokens: 90 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [72] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml new file mode 100644 index 000000000..c203b724a --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml @@ -0,0 +1,134 @@ +name: ctx1_gen2_tp8_batch16_eplb0_mtp3_40 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 2 + decode_nodes: 2 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 
+ max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + disable_overlap_scheduler: false + enable_attention_dp: false + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 16 + max_num_tokens: 80 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [40] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml new file mode 100644 index 000000000..48773bf14 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml @@ -0,0 +1,134 @@ +name: ctx1_gen4_tp8_batch1_eplb0_mtp3_8 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 4 + decode_nodes: 4 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + 
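+ # max_num_tokens and max_seq_len below are 8320 = 8192-token ISL + 128 tokens of headroom on the prefill side.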
max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 1 + disable_overlap_scheduler: false + enable_attention_dp: false + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 1 + max_num_tokens: 4 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [5] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml new file mode 100644 index 000000000..bba0d5a65 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml @@ -0,0 +1,134 @@ +name: ctx1_gen4_tp8_batch4_eplb0_mtp3_20 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 4 + decode_nodes: 4 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 
8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + disable_overlap_scheduler: false + enable_attention_dp: false + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 4 + max_num_tokens: 20 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [20] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml new file mode 100644 index 000000000..9511ede04 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml @@ -0,0 +1,133 @@ +name: ctx2_gen1_dp8_batch16_eplb0_mtp3_144 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 2 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 1 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + 
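+ # eplb0 recipe: no expert load balancer, and MoE expert parallelism stays at 1.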
moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + disable_overlap_scheduler: false + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 16 + max_num_tokens: 180 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [144] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml new file mode 100644 index 000000000..7513770d8 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml @@ -0,0 +1,133 @@ +name: ctx4_gen1_dp8_batch64_eplb0_mtp2_512 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 2 + prefill_workers: 4 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 1 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + 
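+ # mtp2 variant: the speculative_config blocks below use 2 next-n predict layers, versus 3 in the neighboring mtp3 recipes.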
pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + disable_overlap_scheduler: false + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 64 + max_num_tokens: 650 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [512] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml new file mode 100644 index 000000000..2852df6c3 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml @@ -0,0 +1,128 @@ +name: ctx1_gen4_tp8_batch16_eplb0_mtp0_64 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 4 + decode_nodes: 4 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 
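+ # stp recipe (mtp0): no speculative_config block here or in decode, so generation emits one token per forward pass.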
+ print_iter_log: true + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + disable_overlap_scheduler: false + enable_attention_dp: false + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 16 + max_num_tokens: 512 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [64] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml new file mode 100644 index 000000000..68ae8f4dc --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml @@ -0,0 +1,128 @@ +name: ctx1_gen8_tp8_batch2_eplb0_mtp0_16 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 8 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + 
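+ # Decode-side transceiver buffer matches the 8320-token prefill ceiling, so a whole prompt's KV can move over UCX in one transfer.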
cuda_graph_config: + enable_padding: true + max_batch_size: 1 + disable_overlap_scheduler: false + enable_attention_dp: false + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 1 + max_num_tokens: 1 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [10] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml new file mode 100644 index 000000000..1c2977396 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml @@ -0,0 +1,127 @@ +name: ctx2_gen1_dp8_batch32_eplb0_mtp0_256 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 2 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 1 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + disable_overlap_scheduler: false + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + 
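+ # Per-iteration perf and request stats are disabled for the timed runs.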
kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 32 + max_num_tokens: 512 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [256] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml new file mode 100644 index 000000000..343b25905 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml @@ -0,0 +1,127 @@ +name: ctx3_gen1_dp8_batch64_eplb0_mtp0_512 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b300" + prefill_nodes: 2 + prefill_workers: 3 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 1 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + disable_overlap_scheduler: false + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 64 + max_num_tokens: 512 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + 
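+ # max_seq_len 9344 above = 8192 ISL + 1024 OSL + 128 headroom; the MoE itself stays EP1 on this dp8 worker.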
moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [512] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml new file mode 100644 index 000000000..5aa5546ab --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml @@ -0,0 +1,128 @@ +name: ctx3_gen5_tp8_batch64_eplb0_mtp0_256 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b300" + prefill_nodes: 2 + prefill_workers: 3 + gpus_per_prefill: 4 + + decode_workers: 5 + decode_nodes: 5 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + disable_overlap_scheduler: false + enable_attention_dp: false + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 64 + max_num_tokens: 512 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + 
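+ # sa-bench drives an 8K-in / 1K-out workload at unbounded request rate with one fixed concurrency per run.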
isl: 8192 + osl: 1024 + concurrencies: [256] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml new file mode 100644 index 000000000..df8c2831c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml @@ -0,0 +1,127 @@ +name: ctx5_gen1_dp8_batch128_eplb0_mtp0_1075 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b300" + prefill_nodes: 3 + prefill_workers: 5 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 1 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + disable_overlap_scheduler: false + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 128 + max_num_tokens: 512 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [1075] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + 
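+ # 360 attempts x 10 s = up to one hour for all workers to report healthy before the run is abandoned.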
interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml new file mode 100644 index 000000000..9b0df56e9 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml @@ -0,0 +1,127 @@ +name: ctx7_gen1_dep8_batch384_eplb0_mtp0_3072 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b300" + prefill_nodes: 4 + prefill_workers: 7 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 1 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 384 + disable_overlap_scheduler: false + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 384 + max_num_tokens: 512 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [3072] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml new file mode 100644 index 000000000..a8f90e9bd --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml @@ -0,0 +1,117 @@ +name: "ctx1_gen1_dep32_batch4_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 1 + prefill_workers: 1 + + decode_workers: 1 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 16 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "180" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml new file mode 100644 index 000000000..be4f29045 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml @@ -0,0 +1,121 @@ +name: "ctx1_gen4_tep8_batch8_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 1 + prefill_workers: 1 + + decode_workers: 4 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: 
"1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 32 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x8x12x24x48" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml new file mode 100644 index 000000000..5dd8a302b --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml @@ -0,0 +1,152 @@ +name: "ctx2_gen1_dep16_batch256_eplb256_mtp1" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 2 + + decode_workers: 1 + decode_nodes: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + 
enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 256 + max_num_tokens: 512 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 256 + layer_updates_per_iter: 1 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4301" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml new file mode 100644 index 000000000..08fc612ec --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml @@ -0,0 +1,128 @@ +name: "ctx3_gen1_dep32_batch64_eplb288_mtp1" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 3 + prefill_workers: 3 + + decode_workers: 1 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 128 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - 
cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "2253" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml new file mode 100644 index 000000000..44a05c484 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml @@ -0,0 +1,213 @@ +name: "ctx3_gen5_dep4_batch768_eplb0_mtp1" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 3 + prefill_workers: 3 + + decode_workers: 5 + decode_nodes: 5 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 768 + max_num_tokens: 1536 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + - 264 + - 272 + - 280 + - 288 + - 296 + - 304 + - 312 + - 320 + - 328 + - 336 + - 344 + - 352 + - 360 + - 368 + - 376 + - 384 + - 392 + - 400 + - 408 + - 416 + - 424 + - 432 + - 440 + - 448 + - 456 + - 464 + - 472 + - 480 + - 488 + - 496 + - 504 + - 512 + - 520 + - 528 + - 536 + - 544 + - 552 + - 560 + - 568 + - 576 + - 584 + - 592 + - 600 + - 608 + - 616 + - 624 + - 632 + - 640 + - 648 + - 656 + - 664 + - 672 + - 680 + - 688 + - 696 + - 704 + - 712 + - 720 + - 728 + - 736 + - 744 + - 752 + - 760 + - 768 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + 
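+ # Detokenization fans out to 4 postprocess workers, and tokens are flushed to the frontend every 100 steps (stream_interval).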
speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "16130" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml new file mode 100644 index 000000000..c353c3df0 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml @@ -0,0 +1,113 @@ +name: "ctx1_gen1_dep32_batch16_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 1 + prefill_workers: 1 + + decode_workers: 1 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 16 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "666" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml new file mode 100644 index 000000000..a62b540d9 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml @@ -0,0 +1,175 @@ +name: "ctx1_gen1_dep8_batch512_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 1 + prefill_workers: 1 + + decode_workers: 1 + decode_nodes: 2 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + 
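+ # Same GC, PDL, and NCCL graph-mixing flags on both prefill and decode ranks, as in the other gb200 recipes.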
TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 512 + max_num_tokens: 512 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + - 264 + - 272 + - 280 + - 288 + - 296 + - 304 + - 312 + - 320 + - 328 + - 336 + - 344 + - 352 + - 360 + - 368 + - 376 + - 384 + - 392 + - 400 + - 408 + - 416 + - 424 + - 432 + - 440 + - 448 + - 456 + - 464 + - 472 + - 480 + - 488 + - 496 + - 504 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4301" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml new file mode 100644 index 000000000..d56eba13c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml @@ -0,0 +1,207 @@ +name: "ctx1_gen2_dep4_batch768_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 1 + prefill_workers: 1 + + decode_workers: 2 + decode_nodes: 2 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: 
true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 768 + max_num_tokens: 768 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + - 264 + - 272 + - 280 + - 288 + - 296 + - 304 + - 312 + - 320 + - 328 + - 336 + - 344 + - 352 + - 360 + - 368 + - 376 + - 384 + - 392 + - 400 + - 408 + - 416 + - 424 + - 432 + - 440 + - 448 + - 456 + - 464 + - 472 + - 480 + - 488 + - 496 + - 504 + - 512 + - 520 + - 528 + - 536 + - 544 + - 552 + - 560 + - 568 + - 576 + - 584 + - 592 + - 600 + - 608 + - 616 + - 624 + - 632 + - 640 + - 648 + - 656 + - 664 + - 672 + - 680 + - 688 + - 696 + - 704 + - 712 + - 720 + - 728 + - 736 + - 744 + - 752 + - 760 + - 768 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "6144" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml new file mode 100644 index 000000000..94a45661b --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml @@ -0,0 +1,110 @@ +name: "ctx1_gen4_tep8_batch1_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 1 + prefill_workers: 1 + + decode_workers: 4 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + 
decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 1 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "5" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml new file mode 100644 index 000000000..a93c86f82 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml @@ -0,0 +1,125 @@ +name: "ctx1_gen4_tep8_batch32_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 1 + prefill_workers: 1 + + decode_workers: 4 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 8 + - 9 + - 10 + - 11 + - 12 + - 16 + - 22 + - 23 + - 24 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "12x24x48x96x192" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + 
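+  # The flag below keeps etcd/NATS on a node of their own rather than
+  # co-locating the control plane with a benchmark worker.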
etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml new file mode 100644 index 000000000..9aa57eb46 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml @@ -0,0 +1,146 @@ +name: "ctx2_gen1_dep16_batch256_eplb256_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 2 + + decode_workers: 1 + decode_nodes: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 256 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 256 + layer_updates_per_iter: 1 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4301" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml new file mode 100644 index 000000000..3501708c2 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml @@ -0,0 +1,119 @@ +name: "ctx2_gen1_dep32_batch64_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 2 + + decode_workers: 1 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + 
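+  # Each recipe describes a disaggregated pair: prefill ("ctx") workers and
+  # decode ("gen") workers, each with its own environment and engine config.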
+ prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 64 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "2253" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml new file mode 100644 index 000000000..0a88341a1 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml @@ -0,0 +1,152 @@ +name: "ctx11_gen1_dep16_batch256_eplb256_mtp1" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 11 + prefill_workers: 11 + + decode_workers: 1 + decode_nodes: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 
16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 256 + max_num_tokens: 512 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 256 + layer_updates_per_iter: 1 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4301" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml new file mode 100644 index 000000000..b4dd6005d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml @@ -0,0 +1,123 @@ +name: "ctx1_gen4_tep8_batch8_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 1 + prefill_workers: 1 + + decode_workers: 4 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 32 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + 
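+        # Transceiver that hands computed KV-cache blocks from prefill to
+        # decode workers; every recipe in this set pins it to UCX.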
max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x8x12x24x48" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml new file mode 100644 index 000000000..9374538f8 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml @@ -0,0 +1,117 @@ +name: "ctx3_gen1_dep32_batch4_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 3 + prefill_workers: 3 + + decode_workers: 1 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 16 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "180" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml new file mode 100644 index 000000000..a62e4f24f --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml @@ -0,0 +1,128 @@ +name: "ctx7_gen1_dep16_batch64_eplb256_mtp1" + 
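+# The name appears to encode the topology: ctx<prefill nodes>, gen<decode
+# workers>, dep|tep = attention-DP vs. plain tensor parallel at the given
+# width, batch = decode max_batch_size, eplb = EPLB slot count (0 = off),
+# mtp = num_nextn_predict_layers. The fields below should stay in sync.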
+model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 7 + prefill_workers: 7 + + decode_workers: 1 + decode_nodes: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 128 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 256 + layer_updates_per_iter: 1 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1229" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml new file mode 100644 index 000000000..ee3082fe5 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml @@ -0,0 +1,119 @@ +name: "ctx8_gen1_dep32_batch16_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 8 + prefill_workers: 8 + + decode_workers: 1 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + 
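+      # Prefill always runs eagerly: CUDA graphs and the overlap scheduler
+      # are off in every recipe here, presumably because a context worker
+      # processes one large batch per request instead of a steady decode loop.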
disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 64 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "666" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml new file mode 100644 index 000000000..4df408491 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml @@ -0,0 +1,146 @@ +name: "ctx10_gen1_dep16_batch256_eplb256_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 10 + prefill_workers: 10 + + decode_workers: 1 + decode_nodes: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 256 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + 
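+      # With enable_padding, a runtime batch is padded up to the nearest
+      # captured size above, so the ladder can use coarse 8-wide steps.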
print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 256 + layer_updates_per_iter: 1 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml new file mode 100644 index 000000000..4b603ad67 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml @@ -0,0 +1,121 @@ +name: "ctx1_gen4_tep8_batch16_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 1 + prefill_workers: 1 + + decode_workers: 4 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 16 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 3 + - 4 + - 6 + - 8 + - 9 + - 10 + - 11 + - 14 + - 15 + - 16 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "12x44x76" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml new file mode 100644 index 000000000..1ee953844 
--- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml @@ -0,0 +1,112 @@ +name: "ctx1_gen4_tep8_batch1_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 1 + prefill_workers: 1 + + decode_workers: 4 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 1 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "5" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml new file mode 100644 index 000000000..b08791f00 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml @@ -0,0 +1,112 @@ +name: "ctx2_gen1_dep32_batch8_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 2 + + decode_workers: 1 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + 
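+        # Prefill uses the TRTLLM MoE backend here; the decode config below
+        # switches to WIDEEP for its 32-way expert-parallel pool.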
nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 8 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "333" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml new file mode 100644 index 000000000..7f4e9594e --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml @@ -0,0 +1,115 @@ +name: "ctx7_gen1_dep32_batch32_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 7 + prefill_workers: 7 + + decode_workers: 1 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + 
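+  # A single fixed client concurrency; other recipes sweep an x-separated
+  # list such as "12x44x76".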
concurrencies: "1229" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml new file mode 100644 index 000000000..059688716 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml @@ -0,0 +1,127 @@ +name: "ctx8_gen1_dep16_batch128_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 8 + prefill_workers: 8 + + decode_workers: 1 + decode_nodes: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 128 + max_num_tokens: 128 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2253" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml new file mode 100644 index 000000000..ba7f2ff21 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml @@ -0,0 +1,127 @@ +name: ctx1_gen1_dep16_batch64_eplb0_mtp1_1229 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: 
trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.5 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + tensor_parallel_size: 8 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + max_batch_size: 64 + max_num_tokens: 128 + max_seq_len: 2088 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 16 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + stream_interval: 100 + tensor_parallel_size: 16 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['1229'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml new file mode 100644 index 000000000..218b85744 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml @@ -0,0 +1,121 @@ +name: ctx1_gen1_dep32_batch16_eplb0_mtp3_615 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + 
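+        # KV cache held in FP8 (half the footprint of BF16); block reuse is
+        # disabled below so every request pays the full prefill cost.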
enable_block_reuse: false + free_gpu_memory_fraction: 0.5 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + max_batch_size: 16 + max_num_tokens: 64 + max_seq_len: 2088 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 32 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 100 + tensor_parallel_size: 32 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['615'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml new file mode 100644 index 000000000..fe49d8959 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml @@ -0,0 +1,151 @@ +name: ctx1_gen1_dep8_batch256_eplb0_mtp1_2151 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 2 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.5 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + tensor_parallel_size: 8 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + enable_padding: true + 
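+      # Attention-DP decode: each of the 8 expert-parallel ranks keeps its
+      # own attention replica, with the LM head tensor-parallel inside the
+      # DP group (enable_lm_head_tp_in_adp).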
enable_attention_dp: true + enable_lm_head_tp_in_adp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + max_batch_size: 256 + max_num_tokens: 512 + max_seq_len: 2088 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['2151'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml new file mode 100644 index 000000000..25847ed23 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml @@ -0,0 +1,183 @@ +name: ctx1_gen1_dep8_batch512_eplb0_mtp1_4301 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 2 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.5 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + tensor_parallel_size: 8 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + - 264 + - 272 + - 280 + - 288 + - 296 + - 304 + - 312 + - 320 + - 328 + - 336 + - 344 + - 352 + - 360 + - 368 + - 376 + - 384 + - 392 + - 400 + - 408 + - 416 + - 424 + - 432 + - 440 + - 448 + - 456 + - 464 + - 472 + - 480 + - 488 + - 496 + - 504 + - 512 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + max_batch_size: 512 + max_num_tokens: 1024 + max_seq_len: 2088 + moe_config: + backend: DEEPGEMM + 
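+        # These FP8 recipes pin MoE GEMMs to DEEPGEMM; the FP4 variants use
+        # TRTLLM / CUTLASS / CUTEDSL / WIDEEP instead.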
use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['4301'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml new file mode 100644 index 000000000..62d4be838 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml @@ -0,0 +1,120 @@ +name: ctx1_gen3_tep8_batch2_eplb0_mtp3_9 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 3 + decode_nodes: 6 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.5 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: MNNVL + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + enable_padding: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + max_batch_size: 2 + max_num_tokens: 8 + max_seq_len: 2088 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['9'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml new file mode 100644 index 000000000..47f21d46b --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml @@ -0,0 +1,120 @@ +name: ctx1_gen3_tep8_batch4_eplb0_mtp3_18 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 3 + decode_nodes: 6 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.5 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: MNNVL + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + enable_padding: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + max_batch_size: 4 + max_num_tokens: 16 + max_seq_len: 2088 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['18'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml new file mode 100644 index 000000000..ecb7c92cd --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml @@ -0,0 +1,121 @@ +name: ctx1_gen3_tep8_batch8_eplb0_mtp3_36 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 3 + decode_nodes: 6 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + 
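+    # Common knobs: server/worker GC is disabled, presumably to avoid Python
+    # garbage-collection stalls mid-run, and TRTLLM_ENABLE_PDL turns on
+    # programmatic dependent launch.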
ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.5 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: MNNVL + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + enable_padding: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + max_batch_size: 8 + max_num_tokens: 32 + max_seq_len: 2088 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['36'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml new file mode 100644 index 000000000..47b869af5 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml @@ -0,0 +1,129 @@ +name: ctx1_gen1_dep16_batch128_eplb0_mtp0_2151 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.5 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 
8 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + max_batch_size: 128 + max_num_tokens: 128 + max_seq_len: 2088 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 16 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 16 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['2151'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml new file mode 100644 index 000000000..d1e3cae50 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml @@ -0,0 +1,117 @@ +name: ctx1_gen1_dep32_batch32_eplb0_mtp0_1127 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.5 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 2088 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 32 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 32 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: 
['1127'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml new file mode 100644 index 000000000..c48edbd5f --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml @@ -0,0 +1,114 @@ +name: ctx1_gen1_dep32_batch8_eplb0_mtp0_256 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.5 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + max_batch_size: 8 + max_num_tokens: 8 + max_seq_len: 2088 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 32 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 32 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['256'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml new file mode 100644 index 000000000..08139cf82 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml @@ -0,0 +1,177 @@ +name: ctx1_gen1_dep8_batch512_eplb0_mtp0_4301 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 2 + 
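+  # One decode worker spanning 2 nodes x 4 GPUs/node = the 8 GPUs declared
+  # below.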
gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.5 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + - 264 + - 272 + - 280 + - 288 + - 296 + - 304 + - 312 + - 320 + - 328 + - 336 + - 344 + - 352 + - 360 + - 368 + - 376 + - 384 + - 392 + - 400 + - 408 + - 416 + - 424 + - 432 + - 440 + - 448 + - 456 + - 464 + - 472 + - 480 + - 488 + - 496 + - 504 + - 512 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 512 + max_num_tokens: 512 + max_seq_len: 2088 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['4301'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml new file mode 100644 index 000000000..14b33599c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml @@ -0,0 +1,209 @@ +name: ctx1_gen1_dep8_batch768_eplb0_mtp0_6144 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 2 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + 
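+    # Two decode-only flags follow the shared ENROOT/NCCL settings below:
+    # TRTLLM_FORCE_COMM_METHOD pins communication to two-sided NVLink and
+    # ENABLE_CONFIGURABLE_MOE opts into the configurable MoE path (semantics
+    # assumed from the names; the launchers pass them through as-is).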
ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.5 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + - 264 + - 272 + - 280 + - 288 + - 296 + - 304 + - 312 + - 320 + - 328 + - 336 + - 344 + - 352 + - 360 + - 368 + - 376 + - 384 + - 392 + - 400 + - 408 + - 416 + - 424 + - 432 + - 440 + - 448 + - 456 + - 464 + - 472 + - 480 + - 488 + - 496 + - 504 + - 512 + - 520 + - 528 + - 536 + - 544 + - 552 + - 560 + - 568 + - 576 + - 584 + - 592 + - 600 + - 608 + - 616 + - 624 + - 632 + - 640 + - 648 + - 656 + - 664 + - 672 + - 680 + - 688 + - 696 + - 704 + - 712 + - 720 + - 728 + - 736 + - 744 + - 752 + - 760 + - 768 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 768 + max_num_tokens: 768 + max_seq_len: 2088 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['6144'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml new file mode 100644 index 000000000..2b9250430 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml @@ -0,0 +1,114 @@ +name: ctx1_gen3_tep8_batch1_eplb0_mtp0_3 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 3 + decode_nodes: 6 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + 
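+      # Across these gb200-fp8 recipes the prefill side is held fixed
+      # (attention-DP over TP8/EP8, DEEPGEMM MoE, CUDA graphs off); only the
+      # decode block distinguishes the dep* (attention-DP) recipes from the
+      # tep* (attention-TP) ones.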
cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.5 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: MNNVL + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + enable_padding: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + max_batch_size: 1 + max_num_tokens: 1 + max_seq_len: 2088 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['3'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml new file mode 100644 index 000000000..160f4c6ca --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml @@ -0,0 +1,115 @@ +name: ctx1_gen3_tep8_batch8_eplb0_mtp0_27 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 3 + decode_nodes: 6 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.5 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: MNNVL + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + enable_padding: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + max_batch_size: 8 + max_num_tokens: 8 + max_seq_len: 2088 + moe_config: + backend: TRTLLM + 
use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['27'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml new file mode 100644 index 000000000..8f305ced0 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml @@ -0,0 +1,120 @@ +name: ctx1_gen3_tep8_batch2_eplb0_mtp3_6 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 3 + decode_nodes: 6 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: MNNVL + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + enable_padding: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 2 + max_num_tokens: 8 + max_seq_len: 9256 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['6'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml new file mode 100644 index 000000000..bea950ac7 --- 
/dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml @@ -0,0 +1,120 @@ +name: ctx1_gen3_tep8_batch4_eplb0_mtp3_15 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 3 + decode_nodes: 6 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: MNNVL + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + enable_padding: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 4 + max_num_tokens: 16 + max_seq_len: 9256 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['15'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml new file mode 100644 index 000000000..fbf861990 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml @@ -0,0 +1,119 @@ +name: ctx2_gen1_dep32_batch2_eplb0_mtp3_90 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 4 + prefill_workers: 2 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + 
TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + max_batch_size: 2 + max_num_tokens: 8 + max_seq_len: 9256 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 32 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 100 + tensor_parallel_size: 32 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['90'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml new file mode 100644 index 000000000..ea8a7d013 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml @@ -0,0 +1,121 @@ +name: ctx3_gen1_dep16_batch16_eplb0_mtp3_333 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 6 + prefill_workers: 3 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + 
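+      # With MTP3 each scheduled request can contribute up to 4 tokens per
+      # step (1 target + 3 draft), hence max_num_tokens = 4 x max_batch_size
+      # (64 = 4 x 16 below); the same ratio holds across the mtp3 recipes.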
cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + max_batch_size: 16 + max_num_tokens: 64 + max_seq_len: 9256 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 16 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 100 + tensor_parallel_size: 16 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['333'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml new file mode 100644 index 000000000..2ad2e727d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml @@ -0,0 +1,127 @@ +name: ctx3_gen1_dep8_batch64_eplb0_mtp3_666 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 6 + prefill_workers: 3 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 2 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 64 + max_num_tokens: 256 + max_seq_len: 9256 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + 
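+  # 8k1k workload: 8192 input / 1024 output tokens per request; the trailing
+  # number in the recipe name ('666' here) is the benchmark concurrency.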
osl: 1024 + concurrencies: ['666'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml new file mode 100644 index 000000000..95bf6192f --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml @@ -0,0 +1,120 @@ +name: ctx4_gen1_dep32_batch8_eplb0_mtp3_333 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 8 + prefill_workers: 4 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + max_batch_size: 8 + max_num_tokens: 32 + max_seq_len: 9256 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 32 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 100 + tensor_parallel_size: 32 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['333'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml new file mode 100644 index 000000000..35da2b70f --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml @@ -0,0 +1,123 @@ +name: ctx5_gen1_dep16_batch32_eplb0_mtp3_666 + +model: + path: "dsr1-fp8" + container: 
"dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 10 + prefill_workers: 5 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 9256 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 16 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 100 + tensor_parallel_size: 16 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['666'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml new file mode 100644 index 000000000..178a3b7df --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml @@ -0,0 +1,116 @@ +name: ctx1_gen3_tep8_batch16_eplb0_mtp0_63 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 3 + decode_nodes: 6 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + 
cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: MNNVL + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + enable_padding: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + max_batch_size: 16 + max_num_tokens: 16 + max_seq_len: 9256 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['63'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml new file mode 100644 index 000000000..f33813fd9 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml @@ -0,0 +1,114 @@ +name: ctx1_gen3_tep8_batch1_eplb0_mtp0_6 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 3 + decode_nodes: 6 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: MNNVL + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + enable_padding: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + max_batch_size: 1 + max_num_tokens: 1 + max_seq_len: 9256 + moe_config: + backend: TRTLLM + 
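+        # Set on every decode config in this tree; presumably trades some
+        # precision in the MoE combine step for lower interconnect traffic
+        # (assumption from the flag name, not a documented guarantee).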
use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['6'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml new file mode 100644 index 000000000..98aee313b --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml @@ -0,0 +1,114 @@ +name: ctx1_gen3_tep8_batch4_eplb0_mtp0_18 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 3 + decode_nodes: 6 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: MNNVL + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + enable_padding: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + max_batch_size: 4 + max_num_tokens: 4 + max_seq_len: 9256 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['18'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml new file mode 100644 index 000000000..816065639 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml @@ -0,0 +1,114 @@ 
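+# Naming convention, inferred from this directory rather than documented:
+# ctx{P}/gen{G} = prefill/decode worker counts, dep{N} vs tep{N} =
+# attention-DP vs attention-TP decode at EP width N, batch{B} = decode
+# max_batch_size, eplb{E} = EPLB setting, mtp{M} = MTP draft layers, and the
+# trailing number is the benchmark concurrency.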
+name: ctx2_gen1_dep32_batch8_eplb0_mtp0_333 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 4 + prefill_workers: 2 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + max_batch_size: 8 + max_num_tokens: 8 + max_seq_len: 9256 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 32 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 32 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['333'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml new file mode 100644 index 000000000..f7d87c1b3 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml @@ -0,0 +1,117 @@ +name: ctx3_gen1_dep16_batch32_eplb0_mtp0_615 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 6 + prefill_workers: 3 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + 
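+      # CUDA graphs stay off for prefill and the overlap scheduler is
+      # disabled -- our reading of the upstream choice for variable-shape
+      # prefill batches, not a documented requirement.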
cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 9256 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 16 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 16 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['615'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml new file mode 100644 index 000000000..27a19e5b8 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml @@ -0,0 +1,115 @@ +name: ctx4_gen1_dep32_batch16_eplb0_mtp0_666 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 8 + prefill_workers: 4 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + max_batch_size: 16 + max_num_tokens: 16 + max_seq_len: 9256 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 32 + 
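+      # Output-path tuning: 4 postprocess workers and responses streamed
+      # every 100 tokens (stream_interval) -- assumed semantics of these two
+      # knobs.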
num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 32 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['666'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml new file mode 100644 index 000000000..634f07cdb --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml @@ -0,0 +1,121 @@ +name: ctx5_gen1_dep16_batch64_eplb0_mtp0_1229 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 10 + prefill_workers: 5 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 64 + max_num_tokens: 64 + max_seq_len: 9256 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 16 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 16 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['1229'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml new file mode 100644 index 000000000..b4434cdda --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml @@ -0,0 +1,121 @@ +name: 
"ctx1_gen1_dep32_batch8_eplb0_mtp" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 32 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "333" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml new file mode 100644 index 000000000..e264a1796 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml @@ -0,0 +1,216 @@ +name: "ctx1_gen1_dep4_batch768_eplb0_mtp1" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 1 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + 
enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 768 + max_num_tokens: 1536 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + - 264 + - 272 + - 280 + - 288 + - 296 + - 304 + - 312 + - 320 + - 328 + - 336 + - 344 + - 352 + - 360 + - 368 + - 376 + - 384 + - 392 + - 400 + - 408 + - 416 + - 424 + - 432 + - 440 + - 448 + - 456 + - 464 + - 472 + - 480 + - 488 + - 496 + - 504 + - 512 + - 520 + - 528 + - 536 + - 544 + - 552 + - 560 + - 568 + - 576 + - 584 + - 592 + - 600 + - 608 + - 616 + - 624 + - 632 + - 640 + - 648 + - 656 + - 664 + - 672 + - 680 + - 688 + - 696 + - 704 + - 712 + - 720 + - 728 + - 736 + - 744 + - 752 + - 760 + - 768 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "3226" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml new file mode 100644 index 000000000..67c672ffb --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml @@ -0,0 +1,119 @@ +name: "ctx1_gen4_tep8_batch1_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 4 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + 
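+# Prefill runs eagerly: cuda_graph_config is null and the overlap
+# scheduler is disabled just below. A likely rationale (a reading, not
+# stated in the recipe) is that long prefill steps gain little from CUDA
+# graphs, which matter most for small, repetitive decode batches.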
print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 4 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "5" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml new file mode 100644 index 000000000..aab184727 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml @@ -0,0 +1,124 @@ +name: "ctx1_gen4_tep8_batch8_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 4 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 32 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 3 + - 4 + - 6 + - 8 + print_iter_log: true + kv_cache_config: + 
enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "8x12x24x48" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml new file mode 100644 index 000000000..58cbacdf4 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml @@ -0,0 +1,139 @@ +name: "ctx3_gen1_dep16_batch128_eplb256_mtp1" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 2 + prefill_workers: 3 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 128 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 256 + layer_updates_per_iter: 1 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "2253" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: 
false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml new file mode 100644 index 000000000..698989630 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml @@ -0,0 +1,127 @@ +name: "ctx3_gen1_dep32_batch32_eplb288_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 2 + prefill_workers: 3 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1229" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml new file mode 100644 index 000000000..642aa6c43 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml @@ -0,0 +1,113 @@ +name: "ctx1_gen4_tep8_batch1_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + 
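+# Resource math, as these fields appear to compose: 4 tep8 decode workers
+# x 8 GPUs each = 32 GPUs = decode_nodes (8) x gpus_per_node (4).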
decode_workers: 4 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 1 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "5" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml new file mode 100644 index 000000000..44774b6bc --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml @@ -0,0 +1,127 @@ +name: "ctx1_gen4_tep8_batch32_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 4 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + 
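+# The UCX cache transceiver ships prefilled KV blocks to the decode
+# workers; its 16384-token buffer matches prefill max_num_tokens above
+# (a presumed sizing choice, not documented in the recipe).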
cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 3 + - 4 + - 6 + - 8 + - 10 + - 11 + - 12 + - 16 + - 18 + - 20 + - 24 + - 28 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "12x48x96x192" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml new file mode 100644 index 000000000..ffc2850fb --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml @@ -0,0 +1,117 @@ +name: "ctx2_gen1_dep32_batch32_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 2 + + decode_workers: 1 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + 
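+# isl/osl are the benchmark input/output sequence lengths; 1024/1024 is
+# the "1k1k" workload this directory is named for.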
concurrencies: "1229" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml new file mode 100644 index 000000000..28e148d02 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml @@ -0,0 +1,241 @@ +name: "ctx2_gen1_dep8_batch1024_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 2 + + decode_workers: 1 + decode_nodes: 2 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1024 + max_num_tokens: 1024 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + - 264 + - 272 + - 280 + - 288 + - 296 + - 304 + - 312 + - 320 + - 328 + - 336 + - 344 + - 352 + - 360 + - 368 + - 376 + - 384 + - 392 + - 400 + - 408 + - 416 + - 424 + - 432 + - 440 + - 448 + - 456 + - 464 + - 472 + - 480 + - 488 + - 496 + - 504 + - 512 + - 520 + - 528 + - 536 + - 544 + - 552 + - 560 + - 568 + - 576 + - 584 + - 592 + - 600 + - 608 + - 616 + - 624 + - 632 + - 640 + - 648 + - 656 + - 664 + - 672 + - 680 + - 688 + - 696 + - 704 + - 712 + - 720 + - 728 + - 736 + - 744 + - 752 + - 760 + - 768 + - 776 + - 784 + - 792 + - 800 + - 808 + - 816 + - 824 + - 832 + - 840 + - 848 + - 856 + - 864 + - 872 + - 880 + - 888 + - 896 + - 904 + - 912 + - 920 + - 928 + - 936 + - 944 + - 952 + - 960 + - 968 + - 976 + - 984 + - 992 + - 1000 + - 1008 + - 1016 + - 1024 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: 
"sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "8192" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml new file mode 100644 index 000000000..4d4ffe594 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml @@ -0,0 +1,149 @@ +name: "ctx3_gen1_dep16_batch256_eplb256_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 2 + prefill_workers: 3 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 256 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 256 + layer_updates_per_iter: 1 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4301" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml new file mode 100644 index 000000000..de841c92c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml @@ -0,0 +1,122 @@ +name: 
"ctx3_gen1_dep32_batch64_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 2 + prefill_workers: 3 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 64 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "2253" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml new file mode 100644 index 000000000..7bf2a9332 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml @@ -0,0 +1,123 @@ +name: "ctx10_gen1_dep16_batch32_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 5 + prefill_workers: 10 + + decode_workers: 1 + decode_nodes: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + 
disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "666" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml new file mode 100644 index 000000000..09710a97d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml @@ -0,0 +1,151 @@ +name: "ctx10_gen1_dep8_batch256_eplb0_mtp1" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 5 + prefill_workers: 10 + + decode_workers: 1 + decode_nodes: 2 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 256 + max_num_tokens: 512 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 
+ - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2253" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml new file mode 100644 index 000000000..61988358c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml @@ -0,0 +1,131 @@ +name: "ctx13_gen1_dep16_batch64_eplb256_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 7 + prefill_workers: 13 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 256 + layer_updates_per_iter: 1 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1127" + 
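+# req_rate "inf" presumably means requests are injected as fast as the
+# target concurrency permits, so load is set entirely by `concurrencies`.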
req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml new file mode 100644 index 000000000..f07f607ea --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml @@ -0,0 +1,122 @@ +name: "ctx1_gen3_tep8_batch8_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 3 + decode_nodes: 6 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 32 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "33" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml new file mode 100644 index 000000000..be9842323 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml @@ -0,0 +1,119 @@ +name: "ctx1_gen4_tep8_batch1_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 
4 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 4 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "5" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml new file mode 100644 index 000000000..5d45c06d3 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml @@ -0,0 +1,122 @@ +name: "ctx1_gen4_tep8_batch4_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 4 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - 
cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 16 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 3 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "12x24" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml new file mode 100644 index 000000000..c0c4f66e7 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml @@ -0,0 +1,119 @@ +name: "ctx4_gen1_dep32_batch4_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 2 + prefill_workers: 4 + + decode_workers: 1 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 16 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt 
+ - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "180" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml new file mode 100644 index 000000000..e719310a4 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml @@ -0,0 +1,120 @@ +name: "ctx8_gen1_dep32_batch8_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 4 + prefill_workers: 8 + + decode_workers: 1 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 32 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "308" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml new file mode 100644 index 000000000..6b6f4a36e --- /dev/null +++ 
b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml @@ -0,0 +1,146 @@ +name: "ctx11_gen3_dep4_batch256_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 6 + prefill_workers: 11 + gpus_per_prefill: 2 + + decode_workers: 3 + decode_nodes: 3 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 256 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "3228" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml new file mode 100644 index 000000000..42523722e --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml @@ -0,0 +1,129 @@ +name: "ctx14_gen1_dep16_batch128_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 7 + prefill_workers: 14 + + decode_workers: 1 + decode_nodes: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + 
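+# The two DISABLE_GC flags presumably turn off Python garbage collection
+# in the server and worker processes so GC pauses do not show up as
+# latency spikes mid-run (a naming-based reading, not documented here).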
NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 128 + max_num_tokens: 128 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2253" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml new file mode 100644 index 000000000..34678b650 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml @@ -0,0 +1,117 @@ +name: "ctx1_gen3_tep8_batch16_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 3 + decode_nodes: 6 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 16 + max_seq_len: 
9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "72" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml new file mode 100644 index 000000000..158dd4ed9 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml @@ -0,0 +1,113 @@ +name: "ctx1_gen4_tep8_batch1_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 4 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 1 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "5" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml new file mode 100644 index 000000000..f2f18332c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml @@ -0,0 +1,114 @@ +name: "ctx1_gen4_tep8_batch2_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 4 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 2 + max_num_tokens: 2 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "12" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml new file mode 100644 index 000000000..f380710f8 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml @@ -0,0 +1,114 @@ +name: "ctx1_gen5_tep4_batch4_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 5 + decode_nodes: 5 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + 
+ trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 4 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "5x15x30" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml new file mode 100644 index 000000000..8dbb94ea5 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml @@ -0,0 +1,116 @@ +name: "ctx7_gen1_dep32_batch16_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 4 + prefill_workers: 7 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 16 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + 
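# enable_block_reuse: false disables KV prefix-cache reuse across requests +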
free_gpu_memory_fraction: 0.75 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "666" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml new file mode 100644 index 000000000..eba48a69c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml @@ -0,0 +1,122 @@ +name: "ctx9_gen1_dep16_batch64_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 5 + prefill_workers: 9 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 64 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1229" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml new file mode 100644 index 000000000..fd4c842d5 --- /dev/null +++ 
b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml @@ -0,0 +1,126 @@ +name: ctx1_gen1_dep16_batch32_eplb0_mtp3_666 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 4 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 2088 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 16 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 100 + tensor_parallel_size: 16 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['666'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml new file mode 100644 index 000000000..24cc7fcb2 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml @@ -0,0 +1,122 @@ +name: ctx1_gen1_dep32_batch4_eplb0_mtp3_180 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: 
"0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 4 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + max_batch_size: 4 + max_num_tokens: 16 + max_seq_len: 2088 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 32 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 100 + tensor_parallel_size: 32 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['180'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml new file mode 100644 index 000000000..dd886c1c6 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml @@ -0,0 +1,123 @@ +name: ctx1_gen4_tep8_batch1_eplb0_mtp3_8 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 4 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + 
pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: MNNVL + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + enable_padding: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + max_batch_size: 1 + max_num_tokens: 4 + max_seq_len: 2088 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['8'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml new file mode 100644 index 000000000..6625fde5d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml @@ -0,0 +1,123 @@ +name: ctx1_gen4_tep8_batch4_eplb0_mtp3_24 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 4 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: MNNVL + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + enable_padding: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + max_batch_size: 4 + max_num_tokens: 16 + max_seq_len: 2088 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + 
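# max_num_tokens 16 = max_batch_size 4 x (1 target + 3 MTP draft tokens) per decode step +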
pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['24'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml new file mode 100644 index 000000000..14b8c83ec --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml @@ -0,0 +1,138 @@ +name: ctx2_gen1_dep16_batch128_eplb0_mtp1_2253 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + tensor_parallel_size: 4 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + max_batch_size: 128 + max_num_tokens: 256 + max_seq_len: 2088 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 16 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + stream_interval: 100 + tensor_parallel_size: 16 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['2253'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml new file mode 100644 index 000000000..30335f8e4 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml @@ -0,0 +1,124 @@ +name: ctx2_gen1_dep32_batch16_eplb0_mtp3_564 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 4 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + max_batch_size: 16 + max_num_tokens: 64 + max_seq_len: 2088 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 32 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 100 + tensor_parallel_size: 32 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['564'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml new file mode 100644 index 000000000..5985d197c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml @@ -0,0 +1,186 @@ +name: ctx3_gen2_dep8_batch512_eplb0_mtp1_8192 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 3 + prefill_workers: 3 + gpus_per_prefill: 4 + + decode_workers: 2 + decode_nodes: 4 + 
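# 2 decode workers x 8 GPUs each = 4 nodes x 4 GPUs +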
gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + tensor_parallel_size: 4 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + - 264 + - 272 + - 280 + - 288 + - 296 + - 304 + - 312 + - 320 + - 328 + - 336 + - 344 + - 352 + - 360 + - 368 + - 376 + - 384 + - 392 + - 400 + - 408 + - 416 + - 424 + - 432 + - 440 + - 448 + - 456 + - 464 + - 472 + - 480 + - 488 + - 496 + - 504 + - 512 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + max_batch_size: 512 + max_num_tokens: 1024 + max_seq_len: 2088 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['8192'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml new file mode 100644 index 000000000..5d74bf4f0 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml @@ -0,0 +1,119 @@ +name: ctx1_gen4_tep8_batch16_eplb0_mtp0_84 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 4 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + 
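# Python GC is disabled in both server and worker processes, presumably to keep GC pauses out of latency numbers +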
TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: MNNVL + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + enable_padding: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + max_batch_size: 16 + max_num_tokens: 16 + max_seq_len: 2088 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['84'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml new file mode 100644 index 000000000..9b51b74ce --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml @@ -0,0 +1,117 @@ +name: ctx1_gen4_tep8_batch1_eplb0_mtp0_4 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 4 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + 
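# stp variant: no speculative_config in prefill or decode (cf. the mtp recipes) +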
print_iter_log: true + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: MNNVL + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + enable_padding: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + max_batch_size: 1 + max_num_tokens: 1 + max_seq_len: 2088 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['4'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml new file mode 100644 index 000000000..bc0a9ad4a --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml @@ -0,0 +1,117 @@ +name: ctx1_gen4_tep8_batch4_eplb0_mtp0_24 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 4 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: MNNVL + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + enable_padding: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + max_batch_size: 4 + max_num_tokens: 4 + max_seq_len: 2088 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['24'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: 
false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml new file mode 100644 index 000000000..126e651e1 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml @@ -0,0 +1,132 @@ +name: ctx2_gen1_dep16_batch128_eplb0_mtp0_2253 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + max_batch_size: 128 + max_num_tokens: 128 + max_seq_len: 2088 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 16 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 16 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['2253'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml new file mode 100644 index 000000000..f66062760 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml @@ -0,0 +1,120 @@ +name: ctx2_gen1_dep32_batch32_eplb0_mtp0_1229 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" 
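+ # one DEP32 decode worker spans 8 nodes x 4 GPUs; each prefill worker gets a full node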
+ prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 2088 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 32 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 32 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['1229'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml new file mode 100644 index 000000000..68a326b76 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml @@ -0,0 +1,180 @@ +name: ctx3_gen2_dep8_batch512_eplb0_mtp0_8602 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 3 + prefill_workers: 3 + gpus_per_prefill: 4 + + decode_workers: 2 + decode_nodes: 4 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + 
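# prefill skips CUDA graphs and disables the overlap scheduler (settings that follow) +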
cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + - 264 + - 272 + - 280 + - 288 + - 296 + - 304 + - 312 + - 320 + - 328 + - 336 + - 344 + - 352 + - 360 + - 368 + - 376 + - 384 + - 392 + - 400 + - 408 + - 416 + - 424 + - 432 + - 440 + - 448 + - 456 + - 464 + - 472 + - 480 + - 488 + - 496 + - 504 + - 512 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 512 + max_num_tokens: 512 + max_seq_len: 2088 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['8602'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml new file mode 100644 index 000000000..8cd72351d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml @@ -0,0 +1,212 @@ +name: ctx3_gen2_dep8_batch768_eplb0_mtp0_12288 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 3 + prefill_workers: 3 + gpus_per_prefill: 4 + + decode_workers: 2 + decode_nodes: 4 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + 
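# prefill max_num_tokens 16384 admits up to sixteen 1k-token prompts per engine step +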
print_iter_log: true + tensor_parallel_size: 4 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + - 264 + - 272 + - 280 + - 288 + - 296 + - 304 + - 312 + - 320 + - 328 + - 336 + - 344 + - 352 + - 360 + - 368 + - 376 + - 384 + - 392 + - 400 + - 408 + - 416 + - 424 + - 432 + - 440 + - 448 + - 456 + - 464 + - 472 + - 480 + - 488 + - 496 + - 504 + - 512 + - 520 + - 528 + - 536 + - 544 + - 552 + - 560 + - 568 + - 576 + - 584 + - 592 + - 600 + - 608 + - 616 + - 624 + - 632 + - 640 + - 648 + - 656 + - 664 + - 672 + - 680 + - 688 + - 696 + - 704 + - 712 + - 720 + - 728 + - 736 + - 744 + - 752 + - 760 + - 768 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 768 + max_num_tokens: 768 + max_seq_len: 2088 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['12288'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml new file mode 100644 index 000000000..6123b194f --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml @@ -0,0 +1,130 @@ +name: ctx10_gen1_dep16_batch64_eplb0_mtp1_1229 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 10 + prefill_workers: 10 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + 
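# MTP with a single next-token prediction layer (one draft token per step) +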
num_nextn_predict_layers: 1 + tensor_parallel_size: 4 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + max_batch_size: 64 + max_num_tokens: 128 + max_seq_len: 9256 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 16 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + stream_interval: 100 + tensor_parallel_size: 16 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['1229'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml new file mode 100644 index 000000000..3c61eca96 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml @@ -0,0 +1,123 @@ +name: ctx1_gen4_tep8_batch1_eplb0_mtp3_8 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 4 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: MNNVL + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + enable_padding: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 1 + max_num_tokens: 4 + max_seq_len: 9256 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + 
decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['8'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml new file mode 100644 index 000000000..539a3f780 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml @@ -0,0 +1,123 @@ +name: ctx1_gen4_tep8_batch4_eplb0_mtp3_24 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 4 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: MNNVL + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + enable_padding: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 4 + max_num_tokens: 16 + max_seq_len: 9256 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['24'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml new file mode 100644 index 000000000..49e94caa5 --- /dev/null +++ 
b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml @@ -0,0 +1,123 @@ +name: ctx6_gen1_dep32_batch8_eplb0_mtp3_333 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 6 + prefill_workers: 6 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 4 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + max_batch_size: 8 + max_num_tokens: 32 + max_seq_len: 9256 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 32 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 100 + tensor_parallel_size: 32 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['333'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml new file mode 100644 index 000000000..e531467ca --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml @@ -0,0 +1,138 @@ +name: ctx7_gen1_dep8_batch128_eplb0_mtp1_1229 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 7 + prefill_workers: 7 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 2 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + 
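+  # (editorial note): in every GB300 recipe here, decode_environment extends
+  # the prefill set with TRTLLM_FORCE_COMM_METHOD=NVLINK_TWO_SIDED and
+  # ENABLE_CONFIGURABLE_MOE=1; the NVLink comm override is decode-only.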
decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8192 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + tensor_parallel_size: 4 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8192 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 128 + max_num_tokens: 256 + max_seq_len: 9256 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['1229'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml new file mode 100644 index 000000000..fadb3c8c1 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml @@ -0,0 +1,126 @@ +name: ctx8_gen1_dep16_batch32_eplb0_mtp3_666 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 8 + prefill_workers: 8 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + 
max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 4 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 9256 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 16 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 100 + tensor_parallel_size: 16 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['666'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml new file mode 100644 index 000000000..30ba58dcd --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml @@ -0,0 +1,117 @@ +name: ctx1_gen4_tep8_batch1_eplb0_mtp0_4 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 4 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: MNNVL + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + enable_padding: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + max_batch_size: 1 + max_num_tokens: 1 + max_seq_len: 9256 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + 
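+      # (editorial observation): the low-latency "tep" variants like this one
+      # pair the TRTLLM MoE backend with enable_attention_dp: false and
+      # allreduce_strategy: MNNVL, while the throughput "dep" variants use
+      # DEEPGEMM with attention DP enabled.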
moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['4'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml new file mode 100644 index 000000000..091164082 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml @@ -0,0 +1,117 @@ +name: ctx1_gen4_tep8_batch4_eplb0_mtp0_24 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 4 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: MNNVL + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + enable_padding: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + max_batch_size: 4 + max_num_tokens: 4 + max_seq_len: 9256 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['24'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml new file mode 100644 index 000000000..de8d408d1 --- /dev/null +++ 
b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml @@ -0,0 +1,118 @@ +name: ctx1_gen4_tep8_batch8_eplb0_mtp0_36 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 4 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: MNNVL + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + enable_padding: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + max_batch_size: 8 + max_num_tokens: 8 + max_seq_len: 9256 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['36'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml new file mode 100644 index 000000000..70aade3de --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml @@ -0,0 +1,120 @@ +name: ctx4_gen1_dep16_batch32_eplb0_mtp0_666 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 4 + prefill_workers: 4 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + 
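+    # (editorial gloss, assumptions): the two GC flags above disable Python
+    # garbage collection in the server and worker processes to avoid
+    # steady-state pauses; TRTLLM_ENABLE_PDL below turns on programmatic
+    # dependent launch.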
TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 9256 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 16 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 16 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['666'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml new file mode 100644 index 000000000..cfe8dead6 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml @@ -0,0 +1,118 @@ +name: ctx6_gen1_dep32_batch16_eplb0_mtp0_512 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 6 + prefill_workers: 6 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + enable_padding: true + 
enable_attention_dp: true + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + max_batch_size: 16 + max_num_tokens: 16 + max_seq_len: 9256 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 32 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 32 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['512'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml new file mode 100644 index 000000000..97745e8c8 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml @@ -0,0 +1,124 @@ +name: ctx7_gen1_dep16_batch64_eplb0_mtp0_1229 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 7 + prefill_workers: 7 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 64 + max_num_tokens: 64 + max_seq_len: 9256 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 16 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 16 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['1229'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml new file mode 100644 index 000000000..09e23abed --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml @@ -0,0 +1,148 @@ +name: ctx7_gen1_dep8_batch256_eplb0_mtp0_2151 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 7 + prefill_workers: 7 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 2 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 256 + max_num_tokens: 256 + max_seq_len: 9256 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['2151'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml new file mode 100644 index 000000000..104f3b4ab --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml @@ -0,0 +1,105 @@ +name: h100_1k1k_ctx1dep16_gen1dep16_batch32_eplb0_mtp2_chunked_false +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: 
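+  # (editorial note): the H100 recipes assume 8-GPU nodes with each worker
+  # spanning two nodes, so prefill_nodes and decode_nodes are twice the worker
+  # counts and every engine below runs TP16/EP16.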
+ gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 1 + decode_nodes: 2 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + decode_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '615' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml new file mode 100644 index 000000000..4c41ec82a --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml @@ -0,0 +1,109 @@ +name: h100_1k1k_ctx1dep16_gen1dep16_batch64_eplb0_mtp1_chunked_false +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 1 + decode_nodes: 2 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + decode_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + 
TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '1229' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml new file mode 100644 index 000000000..c3dc14082 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml @@ -0,0 +1,101 @@ +name: h100_1k1k_ctx1dep16_gen3dep16_batch4_eplb0_mtp3_chunked_false +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + decode_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 
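+        # the cache transceiver streams prefill KV blocks to decode workers;
+        # max_tokens_in_buffer bounds the staging buffer and backend selects
+        # the transport (editorial gloss).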
+ backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '231' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml new file mode 100644 index 000000000..8f3663c94 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml @@ -0,0 +1,114 @@ +name: h100_1k1k_ctx1dep16_gen3tep16_batch128_eplb0_mtp3_chunked_false +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 128 + max_num_tokens: 512 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + 
decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '462' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml new file mode 100644 index 000000000..bd77671ac --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml @@ -0,0 +1,100 @@ +name: h100_1k1k_ctx1dep16_gen3tep16_batch16_eplb0_mtp3_chunked_false +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '60' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml new file mode 100644 index 000000000..c1fccbc9d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml @@ -0,0 +1,98 @@ +name: h100_1k1k_ctx1dep16_gen3tep16_batch1_eplb0_mtp3_chunked_false +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + 
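+  # (editorial note, assumption): the registry#path:tag form is the
+  # enroot/pyxis image reference syntax consumed by srun --container-image.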
precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '6' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml new file mode 100644 index 000000000..15c71e8d3 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml @@ -0,0 +1,98 @@ +name: h100_1k1k_ctx1dep16_gen3tep16_batch2_eplb0_mtp3_chunked_false +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: 
true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 2 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '9' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml new file mode 100644 index 000000000..4f261058e --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml @@ -0,0 +1,102 @@ +name: h100_1k1k_ctx1dep16_gen3tep16_batch32_eplb0_mtp3_chunked_false +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 
0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '117' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml new file mode 100644 index 000000000..07de7a34d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml @@ -0,0 +1,99 @@ +name: h100_1k1k_ctx1dep16_gen3tep16_batch8_eplb0_mtp3_chunked_false +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '30' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml new file mode 100644 index 000000000..4a55e5ed8 --- /dev/null +++ 
b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml @@ -0,0 +1,97 @@ +name: ctx1dep16_gen3dep16_batch16_eplb0_mtp0 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + decode_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '924' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml new file mode 100644 index 000000000..2bedf4c23 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml @@ -0,0 +1,99 @@ +name: ctx1dep16_gen3dep16_batch32_eplb0_mtp0 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + decode_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + 
TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '1845' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml new file mode 100644 index 000000000..1ff9ace49 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml @@ -0,0 +1,95 @@ +name: ctx1dep16_gen3dep16_batch4_eplb0_mtp0 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + decode_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + decode: + tensor_parallel_size: 16 + 
moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '231' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml new file mode 100644 index 000000000..215e8a6bf --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml @@ -0,0 +1,96 @@ +name: ctx1dep16_gen3dep16_batch8_eplb0_mtp0 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + decode_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '462' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml new file mode 100644 index 000000000..4281abed2 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml @@ -0,0 +1,94 @@ +name: ctx1dep16_gen3tep16_batch16_eplb0_mtp0 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '60' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml new file mode 100644 index 000000000..a0e0005e8 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml @@ -0,0 +1,92 @@ +name: ctx1dep16_gen3tep16_batch1_eplb0_mtp0 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + 
TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8192 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '6' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml new file mode 100644 index 000000000..6eee90d2d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml @@ -0,0 +1,92 @@ +name: ctx1dep16_gen3tep16_batch2_eplb0_mtp0 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8192 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 2 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: 
+ enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '9' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml new file mode 100644 index 000000000..29e634316 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml @@ -0,0 +1,93 @@ +name: ctx1dep16_gen3tep16_batch8_eplb0_mtp0 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '30' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml new file mode 100644 index 000000000..bb02cdd0a --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml @@ -0,0 +1,127 @@ +name: ctx2dep16_gen1dep16_batch256_eplb0_mtp0 
+model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 2 + prefill_nodes: 4 + decode_workers: 1 + decode_nodes: 2 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + decode_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 256 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '4916' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml new file mode 100644 index 000000000..b78cb01af --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml @@ -0,0 +1,101 @@ +name: h100_8k1k_ctx1dep16_gen1dep16_batch4_eplb0_mtp3 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 1 + decode_nodes: 2 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + 
UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8256 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 128 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + concurrencies: '77' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml new file mode 100644 index 000000000..dd0ddda85 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml @@ -0,0 +1,103 @@ +name: h100_8k1k_ctx1dep16_gen2tep16_batch32_eplb0_mtp3 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 2 + decode_nodes: 4 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8256 + speculative_config: + 
decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + concurrencies: '78' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml new file mode 100644 index 000000000..2f0ef4e90 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml @@ -0,0 +1,99 @@ +name: h100_8k1k_ctx1dep16_gen3tep16_batch1_eplb0_mtp3 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8256 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + concurrencies: '6' + 
req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml new file mode 100644 index 000000000..be3fc74ce --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml @@ -0,0 +1,99 @@ +name: h100_8k1k_ctx1dep16_gen3tep16_batch2_eplb0_mtp3 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8256 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 2 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + concurrencies: '9' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml new file mode 100644 index 000000000..6a710bbb5 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml @@ -0,0 +1,100 @@ +name: h100_8k1k_ctx1dep16_gen3tep16_batch8_eplb0_mtp3 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: 
trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8256 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + concurrencies: '30' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml new file mode 100644 index 000000000..4d746af13 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml @@ -0,0 +1,102 @@ +name: h100_8k1k_ctx2dep16_gen1dep16_batch8_eplb0_mtp3 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 2 + prefill_nodes: 4 + decode_workers: 1 + decode_nodes: 2 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: 
true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8256 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 128 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + concurrencies: '154' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml new file mode 100644 index 000000000..2f630277e --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml @@ -0,0 +1,104 @@ + + +name: "h100_8k1k_ctx1dep16_gen2tep16_batch64_eplb0_mtp0" + +model: + path: "DeepSeek-R1-0528" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: "fp8" + +resources: + gpu_type: "h100" + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 2 + decode_nodes: 4 + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: "1" + TRTLLM_FORCE_ALLTOALL_METHOD: "DeepEP" + + decode_environment: + NCCL_NVLS_ENABLE: "0" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8256 + + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64] + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + 
use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "154" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # Multiple frontends collide on port 8080 (among other errors), so keep a single frontend. + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml new file mode 100644 index 000000000..9081201ba --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml @@ -0,0 +1,94 @@ +name: h100_8k1k_ctx1dep16_gen3tep16_batch1_eplb0_mtp0 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8256 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + concurrencies: '6' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml new file mode 100644 index 000000000..938fd965c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml @@ -0,0 +1,104 @@ + + +name: "h100_8k1k_ctx1dep16_gen3tep16_batch2_eplb0_mtp0" + +model: + path: "DeepSeek-R1-0528" +
container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: "fp8" + +resources: + gpu_type: "h100" + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: "1" + TRTLLM_FORCE_ALLTOALL_METHOD: "DeepEP" + + decode_environment: + NCCL_NVLS_ENABLE: "0" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8256 + + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 2 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: [1, 2, 4] + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "9" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # Multiple frontends collide on port 8080 (among other errors), so keep a single frontend.
+ +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml new file mode 100644 index 000000000..c1eb86c19 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml @@ -0,0 +1,104 @@ + + +name: "h100_8k1k_ctx1dep16_gen3tep16_batch8_eplb0_mtp0" + +model: + path: "DeepSeek-R1-0528" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: "fp8" + +resources: + gpu_type: "h100" + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: "1" + TRTLLM_FORCE_ALLTOALL_METHOD: "DeepEP" + + decode_environment: + NCCL_NVLS_ENABLE: "0" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8256 + + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: [1, 2, 4, 8] + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "30" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # Multiple frontends collide on port 8080 (among other errors), so keep a single frontend.
+ +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml new file mode 100644 index 000000000..40c84770f --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml @@ -0,0 +1,97 @@ +name: h100_8k1k_ctx2dep16_gen1dep16_batch16_eplb0_mtp0 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 2 + prefill_nodes: 4 + decode_workers: 1 + decode_nodes: 2 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8256 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 128 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + concurrencies: '308' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml new file mode 100644 index 000000000..7c3fc7c0e --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml @@ -0,0 +1,107 @@ +name: "c128_ctx1_gen7_dep8_batch128_eplb0_mtp3" + +model: + path: "dsr1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: "fp8" + +sbatch_directives: + cpus-per-gpu: "16" + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_workers: 7 + decode_nodes: 7 + gpus_per_node: 8 + +backend: + type: trtllm + prefill_environment: + UCX_TLS: 
"rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + decode_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + trtllm_config: + prefill: + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_chunked_prefill: false + max_batch_size: 8 + max_num_tokens: 8192 + max_seq_len: 1064 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8192 + moe_config: + backend: CUTLASS + cuda_graph_config: null + disable_overlap_scheduler: true + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + enable_chunked_prefill: false + max_batch_size: 128 + max_num_tokens: 512 + max_seq_len: 2088 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8192 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cuda_graph_config: + enable_padding: true + batch_sizes: [1,2,4,8,16,24,32,40,48,56,64,72,80,88,96,104,112,120,128] + disable_overlap_scheduler: false + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "896" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml new file mode 100644 index 000000000..4feb8690d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml @@ -0,0 +1,137 @@ +name: "c16_ctx1_gen9_tep8_batch128_eplb0_mtp3" + +model: + path: "dsr1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: "fp8" + +sbatch_directives: + cpus-per-gpu: "16" + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + + decode_workers: 9 + decode_nodes: 9 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + trtllm_config: + prefill: + # Prefill Worker Config for Dynamo DSR1 (MTP mode) + # ISL/OSL: 1k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: true +
+      enable_chunked_prefill: false
+      max_batch_size: 8
+      max_num_tokens: 8192
+      max_seq_len: 1064
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+    decode:
+      # Decode Worker Config for Dynamo DSR1 (MTP c=16)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      enable_chunked_prefill: false
+      max_batch_size: 128
+      max_num_tokens: 512
+      max_seq_len: 2088
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+      disable_overlap_scheduler: false
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "144"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx.
+
+dynamo:
+  install: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml
new file mode 100644
index 000000000..522618223
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml
@@ -0,0 +1,117 @@
+name: "c1_ctx1_gen11_tep8_batch1_eplb0_mtp3"
+
+model:
+  path: "dsr1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: "fp8"
+
+sbatch_directives:
+  cpus-per-gpu: "16"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 1
+  prefill_workers: 1
+
+  decode_workers: 11
+  decode_nodes: 11
+
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  decode_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  trtllm_config:
+    prefill:
+      # Prefill Worker Config for Dynamo DSR1 (MTP mode, aggressive ctx:gen 1:11 for c=1)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_chunked_prefill: false
+      max_batch_size: 8
+      max_num_tokens: 8192
+      max_seq_len: 1064
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+    decode:
+      # Decode Worker Config for Dynamo DSR1 (MTP c=1, TEP mode)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      enable_chunked_prefill: false
+      max_batch_size: 1
+      max_num_tokens: 4
+      max_seq_len: 2088
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+      disable_overlap_scheduler: false
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "13"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+dynamo:
+  install: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml
new file mode 100644
index 000000000..5be701be2
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml
@@ -0,0 +1,107 @@
+name: "c256_ctx1_gen4_dep8_batch128_eplb0_mtp3"
+
+model:
+  path: "dsr1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: "fp8"
+
+sbatch_directives:
+  cpus-per-gpu: "16"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 1
+  prefill_workers: 1
+  decode_workers: 4
+  decode_nodes: 4
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+  decode_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+  trtllm_config:
+    prefill:
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_chunked_prefill: false
+      max_batch_size: 8
+      max_num_tokens: 8192
+      max_seq_len: 1064
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+    decode:
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      enable_chunked_prefill: false
+      max_batch_size: 128
+      max_num_tokens: 512
+      max_seq_len: 2088
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.85
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes: [1,2,4,8,16,24,32,40,48,56,64,72,80,88,96,104,112,120,128]
+      disable_overlap_scheduler: false
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "1024"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+dynamo:
+  install: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
new file mode 100644
index 000000000..6e8464280
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
@@ -0,0 +1,107 @@
+name: "c32_ctx1_gen11_tep8_batch128_eplb0_mtp3"
+
+model:
+  path: "dsr1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: "fp8"
+
+sbatch_directives:
+  cpus-per-gpu: "16"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 1
+  prefill_workers: 1
+  decode_workers: 11
+  decode_nodes: 11
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+  decode_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+  trtllm_config:
+    prefill:
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_chunked_prefill: false
+      max_batch_size: 8
+      max_num_tokens: 8192
+      max_seq_len: 1064
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+    decode:
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      enable_chunked_prefill: false
+      max_batch_size: 128
+      max_num_tokens: 512
+      max_seq_len: 2088
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes: [1,2,4,8,16,24,32,40,48,56,64,72,80,88,96,104,112,120,128]
+      disable_overlap_scheduler: false
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "352"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+dynamo:
+  install: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
new file mode 100644
index 000000000..69f96bac7
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
@@ -0,0 +1,135 @@
+name: "c4_ctx1_gen11_tep8_batch128_eplb0_mtp3"
+
+model:
+  path: "dsr1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: "fp8"
+
+sbatch_directives:
+  cpus-per-gpu: "16"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 1
+  prefill_workers: 1
+
+  decode_workers: 11
+  decode_nodes: 11
+
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  decode_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  trtllm_config:
+    prefill:
+      # Prefill Worker Config for Dynamo DSR1 (MTP mode, aggressive ctx:gen 1:11 for c=4)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_chunked_prefill: false
+      max_batch_size: 8
+      max_num_tokens: 8192
+      max_seq_len: 1064
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+    decode:
+      # Decode Worker Config for Dynamo DSR1 (MTP c=4, TEP mode)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      enable_chunked_prefill: false
+      max_batch_size: 128
+      max_num_tokens: 512
+      max_seq_len: 2088
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+      disable_overlap_scheduler: false
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "44"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+dynamo:
+  install: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml
new file mode 100644
index 000000000..a7275865f
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml
@@ -0,0 +1,153 @@
+name: "c512_ctx1_gen2_dep8_batch256_eplb0_mtp1"
+
+model:
+  path: "dsr1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: "fp8"
+
+sbatch_directives:
+  cpus-per-gpu: "16"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 1
+  prefill_workers: 1
+
+  decode_workers: 2
+  decode_nodes: 2
+
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  decode_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  trtllm_config:
+    prefill:
+      # Prefill Worker Config for Dynamo DSR1 (MTP mode)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_chunked_prefill: false
+      max_batch_size: 8
+      max_num_tokens: 8192
+      max_seq_len: 1064
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 1
+    decode:
+      # Decode Worker Config for Dynamo DSR1 (MTP c=512)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      enable_chunked_prefill: false
+      max_batch_size: 256
+      max_num_tokens: 512
+      max_seq_len: 2088
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.85
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+          - 136
+          - 144
+          - 152
+          - 160
+          - 168
+          - 176
+          - 184
+          - 192
+          - 200
+          - 208
+          - 216
+          - 224
+          - 232
+          - 240
+          - 248
+          - 256
+      disable_overlap_scheduler: false
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 1
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "1024"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx.
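+# Sizing note: decode max_num_tokens (512) appears to be max_batch_size (256)
+# times the two tokens scored per request per step under MTP-1 (one draft plus
+# one target token); this is inferred from the numbers, not an upstream comment.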
+
+dynamo:
+  install: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml
new file mode 100644
index 000000000..b68aae478
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml
@@ -0,0 +1,137 @@
+name: "c64_ctx1_gen8_dep8_batch128_eplb0_mtp3"
+
+model:
+  path: "dsr1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: "fp8"
+
+sbatch_directives:
+  cpus-per-gpu: "16"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 1
+  prefill_workers: 1
+
+  decode_workers: 8
+  decode_nodes: 8
+
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  decode_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  trtllm_config:
+    prefill:
+      # Prefill Worker Config for Dynamo DSR1 (MTP mode)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_chunked_prefill: false
+      max_batch_size: 8
+      max_num_tokens: 8192
+      max_seq_len: 1064
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+    decode:
+      # Decode Worker Config for Dynamo DSR1 (MTP c=64)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      enable_chunked_prefill: false
+      max_batch_size: 128
+      max_num_tokens: 512
+      max_seq_len: 2088
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.85
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+      disable_overlap_scheduler: false
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "512"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx.
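+# Naming note: the cN prefix in these recipe names appears to be the target
+# concurrency per decode worker; benchmark.concurrencies is then cN x
+# decode_workers (here 64 x 8 = 512).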
+
+dynamo:
+  install: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
new file mode 100644
index 000000000..506a8c580
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
@@ -0,0 +1,107 @@
+name: "c8_ctx1_gen11_tep8_batch128_eplb0_mtp3"
+
+model:
+  path: "dsr1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: "fp8"
+
+sbatch_directives:
+  cpus-per-gpu: "16"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 1
+  prefill_workers: 1
+  decode_workers: 11
+  decode_nodes: 11
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+  decode_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+  trtllm_config:
+    prefill:
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_chunked_prefill: false
+      max_batch_size: 8
+      max_num_tokens: 8192
+      max_seq_len: 1064
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+    decode:
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      enable_chunked_prefill: false
+      max_batch_size: 128
+      max_num_tokens: 512
+      max_seq_len: 2088
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes: [1,2,4,8,16,24,32,40,48,56,64,72,80,88,96,104,112,120,128]
+      disable_overlap_scheduler: false
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "88"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+dynamo:
+  install: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml
new file mode 100644
index 000000000..5d910619d
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml
@@ -0,0 +1,182 @@
+name: "c128_ctx1_gen9_dep8_batch512_eplb0_mtp0"
+
+model:
+  path: "dsr1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: "fp8"
+
+sbatch_directives:
+  cpus-per-gpu: "16"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 1
+  prefill_workers: 1
+
+  decode_workers: 9
+  decode_nodes: 9
+
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  decode_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  trtllm_config:
+    prefill:
+      # Prefill Worker Config for Dynamo DSR1 (DEP mode)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      # Matches E2E standalone ctx_config.yaml
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_chunked_prefill: false
+      max_batch_size: 8
+      max_num_tokens: 8192
+      max_seq_len: 1064
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+
+    decode:
+      # Decode Worker Config for Dynamo DSR1 (DEP mode)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      # Matches E2E standalone gen_config.yaml (DEP c=128)
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      enable_chunked_prefill: false
+      max_batch_size: 512
+      max_num_tokens: 512
+      max_seq_len: 2088
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+          - 136
+          - 144
+          - 152
+          - 160
+          - 168
+          - 176
+          - 184
+          - 192
+          - 200
+          - 208
+          - 216
+          - 224
+          - 232
+          - 240
+          - 248
+          - 256
+          - 264
+          - 272
+          - 280
+          - 288
+          - 296
+          - 304
+          - 312
+          - 320
+          - 328
+          - 336
+          - 344
+          - 352
+          - 360
+          - 368
+          - 376
+          - 384
+          - 392
+          - 400
+          - 408
+          - 416
+          - 424
+          - 432
+          - 440
+          - 448
+          - 456
+          - 464
+          - 472
+          - 480
+          - 488
+          - 496
+          - 504
+          - 512
+      disable_overlap_scheduler: false
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "1152"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx.
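+# DEP note: this recipe sets enable_attention_dp: true, so attention runs
+# data-parallel across the 8 GPUs while MoE stays expert-parallel, which favors
+# throughput at the large per-worker batch (512) used here.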
+
+dynamo:
+  install: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
new file mode 100644
index 000000000..a11789b29
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
@@ -0,0 +1,147 @@
+name: "c16_ctx1_gen9_tep8_batch256_eplb0_mtp0"
+
+model:
+  path: "dsr1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: "fp8"
+
+sbatch_directives:
+  cpus-per-gpu: "16"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 1
+  prefill_workers: 1
+
+  decode_workers: 9
+  decode_nodes: 9
+
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  decode_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  trtllm_config:
+    prefill:
+      # Prefill Worker Config for Dynamo DSR1 (TEP mode)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_chunked_prefill: false
+      max_batch_size: 8
+      max_num_tokens: 8192
+      max_seq_len: 1064
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+    decode:
+      # Decode Worker Config for Dynamo DSR1 (TEP c=16)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      enable_chunked_prefill: false
+      max_batch_size: 256
+      max_num_tokens: 256
+      max_seq_len: 2088
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+          - 136
+          - 144
+          - 152
+          - 160
+          - 168
+          - 176
+          - 184
+          - 192
+          - 200
+          - 208
+          - 216
+          - 224
+          - 232
+          - 240
+          - 248
+          - 256
+      disable_overlap_scheduler: false
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "144"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx.
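+# TEP note: this recipe sets enable_attention_dp: false, keeping attention
+# sharded across the 8-way tensor-parallel group; the TEP points sit on the
+# low-latency end of the sweep, with smaller per-worker batches.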
+
+dynamo:
+  install: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml
new file mode 100644
index 000000000..554f516e2
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml
@@ -0,0 +1,113 @@
+name: "c1_ctx1_gen9_tep8_batch1_eplb0_mtp0"
+
+model:
+  path: "dsr1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: "fp8"
+
+sbatch_directives:
+  cpus-per-gpu: "16"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 1
+  prefill_workers: 1
+
+  decode_workers: 9
+  decode_nodes: 9
+
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  decode_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  trtllm_config:
+    prefill:
+      # Prefill Worker Config for Dynamo DSR1 (TEP mode)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_chunked_prefill: false
+      max_batch_size: 8
+      max_num_tokens: 8192
+      max_seq_len: 1064
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+    decode:
+      # Decode Worker Config for Dynamo DSR1 (TEP c=1)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      enable_chunked_prefill: false
+      max_batch_size: 1
+      max_num_tokens: 1
+      max_seq_len: 2088
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+      disable_overlap_scheduler: false
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "11"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx.
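+# Latency-floor point: batch size 1 with a single captured CUDA graph;
+# concurrencies is 11 against 9 decode workers, presumably to keep every
+# worker busy while requests hand off through the frontend.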
+
+dynamo:
+  install: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml
new file mode 100644
index 000000000..c48eded81
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml
@@ -0,0 +1,101 @@
+name: "c256_ctx1_gen6_dep8_batch512_eplb0_mtp0"
+
+model:
+  path: "dsr1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: "fp8"
+
+sbatch_directives:
+  cpus-per-gpu: "16"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 1
+  prefill_workers: 1
+  decode_workers: 6
+  decode_nodes: 6
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+  decode_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+  trtllm_config:
+    prefill:
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_chunked_prefill: false
+      max_batch_size: 8
+      max_num_tokens: 8192
+      max_seq_len: 1064
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+    decode:
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      enable_chunked_prefill: false
+      max_batch_size: 512
+      max_num_tokens: 512
+      max_seq_len: 2088
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes: [1,2,4,8,16,24,32,40,48,56,64,72,80,88,96,104,112,120,128,136,144,152,160,168,176,184,192,200,208,216,224,232,240,248,256,264,272,280,288,296,304,312,320,328,336,344,352,360,368,376,384,392,400,408,416,424,432,440,448,456,464,472,480,488,496,504,512]
+      disable_overlap_scheduler: false
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "1536"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+dynamo:
+  install: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
new file mode 100644
index 000000000..473753df3
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
@@ -0,0 +1,147 @@
+name: "c32_ctx1_gen9_tep8_batch256_eplb0_mtp0"
+
+model:
+  path: "dsr1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: "fp8"
+
+sbatch_directives:
+  cpus-per-gpu: "16"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 1
+  prefill_workers: 1
+
+  decode_workers: 9
+  decode_nodes: 9
+
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  decode_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  trtllm_config:
+    prefill:
+      # Prefill Worker Config for Dynamo DSR1 (TEP mode)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_chunked_prefill: false
+      max_batch_size: 8
+      max_num_tokens: 8192
+      max_seq_len: 1064
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+    decode:
+      # Decode Worker Config for Dynamo DSR1 (TEP c=32)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      enable_chunked_prefill: false
+      max_batch_size: 256
+      max_num_tokens: 256
+      max_seq_len: 2088
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+          - 136
+          - 144
+          - 152
+          - 160
+          - 168
+          - 176
+          - 184
+          - 192
+          - 200
+          - 208
+          - 216
+          - 224
+          - 232
+          - 240
+          - 248
+          - 256
+      disable_overlap_scheduler: false
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "288"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx.
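+# Memory note: prefill reserves only 0.6 of free GPU memory for KV cache while
+# decode reserves 0.9; a prefill worker holds at most 8 short requests, whereas
+# a decode worker here may cache KV for up to 256 in-flight sequences.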
+
+dynamo:
+  install: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
new file mode 100644
index 000000000..80784e19d
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
@@ -0,0 +1,147 @@
+name: "c4_ctx1_gen9_tep8_batch256_eplb0_mtp0"
+
+model:
+  path: "dsr1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: "fp8"
+
+sbatch_directives:
+  cpus-per-gpu: "16"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 1
+  prefill_workers: 1
+
+  decode_workers: 9
+  decode_nodes: 9
+
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  decode_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  trtllm_config:
+    prefill:
+      # Prefill Worker Config for Dynamo DSR1 (TEP mode)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_chunked_prefill: false
+      max_batch_size: 8
+      max_num_tokens: 8192
+      max_seq_len: 1064
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+    decode:
+      # Decode Worker Config for Dynamo DSR1 (TEP c=4)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      enable_chunked_prefill: false
+      max_batch_size: 256
+      max_num_tokens: 256
+      max_seq_len: 2088
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+          - 136
+          - 144
+          - 152
+          - 160
+          - 168
+          - 176
+          - 184
+          - 192
+          - 200
+          - 208
+          - 216
+          - 224
+          - 232
+          - 240
+          - 248
+          - 256
+      disable_overlap_scheduler: false
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "36"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx.
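+# Sequence-length note: the budgets track the 1k/1k workload; prefill
+# max_seq_len 1064 is the 1024-token ISL plus a 40-token margin, and decode
+# max_seq_len 2088 is ISL + 1024-token OSL plus the same margin.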
+
+dynamo:
+  install: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml
new file mode 100644
index 000000000..7c695e47f
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml
@@ -0,0 +1,182 @@
+name: "c512_ctx2_gen7_dep8_batch512_eplb0_mtp0"
+
+model:
+  path: "dsr1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: "fp8"
+
+sbatch_directives:
+  cpus-per-gpu: "16"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 2
+  prefill_workers: 2
+
+  decode_workers: 7
+  decode_nodes: 7
+
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  decode_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  trtllm_config:
+    prefill:
+      # Prefill Worker Config for Dynamo DSR1 (DEP mode)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      # Matches E2E standalone ctx_config.yaml
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_chunked_prefill: false
+      max_batch_size: 8
+      max_num_tokens: 8192
+      max_seq_len: 1064
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+
+    decode:
+      # Decode Worker Config for Dynamo DSR1 (DEP mode)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      # Matches E2E standalone gen_config.yaml (DEP c=512)
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      enable_chunked_prefill: false
+      max_batch_size: 512
+      max_num_tokens: 512
+      max_seq_len: 2088
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+          - 136
+          - 144
+          - 152
+          - 160
+          - 168
+          - 176
+          - 184
+          - 192
+          - 200
+          - 208
+          - 216
+          - 224
+          - 232
+          - 240
+          - 248
+          - 256
+          - 264
+          - 272
+          - 280
+          - 288
+          - 296
+          - 304
+          - 312
+          - 320
+          - 328
+          - 336
+          - 344
+          - 352
+          - 360
+          - 368
+          - 376
+          - 384
+          - 392
+          - 400
+          - 408
+          - 416
+          - 424
+          - 432
+          - 440
+          - 448
+          - 456
+          - 464
+          - 472
+          - 480
+          - 488
+          - 496
+          - 504
+          - 512
+      disable_overlap_scheduler: false
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "3584"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx.
+
+dynamo:
+  install: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
new file mode 100644
index 000000000..69d7b8708
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
@@ -0,0 +1,147 @@
+name: "c64_ctx1_gen9_tep8_batch256_eplb0_mtp0"
+
+model:
+  path: "dsr1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: "fp8"
+
+sbatch_directives:
+  cpus-per-gpu: "16"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 1
+  prefill_workers: 1
+
+  decode_workers: 9
+  decode_nodes: 9
+
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  decode_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  trtllm_config:
+    prefill:
+      # Prefill Worker Config for Dynamo DSR1 (TEP mode)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_chunked_prefill: false
+      max_batch_size: 8
+      max_num_tokens: 8192
+      max_seq_len: 1064
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+    decode:
+      # Decode Worker Config for Dynamo DSR1 (TEP c=64)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      enable_chunked_prefill: false
+      max_batch_size: 256
+      max_num_tokens: 256
+      max_seq_len: 2088
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+          - 136
+          - 144
+          - 152
+          - 160
+          - 168
+          - 176
+          - 184
+          - 192
+          - 200
+          - 208
+          - 216
+          - 224
+          - 232
+          - 240
+          - 248
+          - 256
+      disable_overlap_scheduler: false
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "576"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx.
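+# CUDA-graph note: with enable_padding: true, decode batches are padded up to
+# the nearest captured graph size, so the batch_sizes list only needs the
+# 1/2/4/8 ramp plus multiples of 8 up to max_batch_size (256).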
+
+dynamo:
+  install: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
new file mode 100644
index 000000000..0c1828f27
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
@@ -0,0 +1,147 @@
+name: "c8_ctx1_gen9_tep8_batch256_eplb0_mtp0"
+
+model:
+  path: "dsr1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: "fp8"
+
+sbatch_directives:
+  cpus-per-gpu: "16"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 1
+  prefill_workers: 1
+
+  decode_workers: 9
+  decode_nodes: 9
+
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  decode_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  trtllm_config:
+    prefill:
+      # Prefill Worker Config for Dynamo DSR1 (TEP mode)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_chunked_prefill: false
+      max_batch_size: 8
+      max_num_tokens: 8192
+      max_seq_len: 1064
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+    decode:
+      # Decode Worker Config for Dynamo DSR1 (TEP c=8)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      enable_chunked_prefill: false
+      max_batch_size: 256
+      max_num_tokens: 256
+      max_seq_len: 2088
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+          - 136
+          - 144
+          - 152
+          - 160
+          - 168
+          - 176
+          - 184
+          - 192
+          - 200
+          - 208
+          - 216
+          - 224
+          - 232
+          - 240
+          - 248
+          - 256
+      disable_overlap_scheduler: false
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "72"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx.
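+# Environment note: the UCX_TLS pin and the PDL/GC/NCCL toggles above repeat in
+# every H200 recipe in this sweep; UCX_TLS appears to restrict UCX to IB verbs
+# (rc/dc/ud) plus the CUDA copy paths used for prefill-to-decode KV transfer,
+# with tcp as a fallback.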
+
+dynamo:
+  install: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml
new file mode 100644
index 000000000..3bacea3c6
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml
@@ -0,0 +1,117 @@
+name: "c128_ctx2_gen1_dep8_batch32_eplb0_mtp2"
+
+model:
+  path: "dsr1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: "fp8"
+
+sbatch_directives:
+  cpus-per-gpu: "16"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 2
+  prefill_workers: 2
+
+  decode_workers: 1
+  decode_nodes: 1
+
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  decode_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  trtllm_config:
+    prefill:
+      # Prefill Worker Config for Dynamo DSR1 (MTP mode)
+      # ISL/OSL: 8k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_chunked_prefill: false
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.85
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 32768
+      moe_config:
+        backend: CUTLASS
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 2
+    decode:
+      # Decode Worker Config for Dynamo DSR1 (MTP c=128)
+      # ISL/OSL: 8k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_chunked_prefill: false
+      max_batch_size: 32
+      max_num_tokens: 128
+      max_seq_len: 9256
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.85
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes: [1, 2, 4, 8, 16, 32]
+      disable_overlap_scheduler: false
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 2
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "128"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx.
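+# Sizing note: for the 8k ISL, prefill max_num_tokens 16640 gives each of the
+# 2 requests in a prefill batch 8320 tokens (8192 ISL plus margin), and the
+# 32768-token transceiver buffer covers both in-flight KV transfers; this is
+# inferred from the numbers, not an upstream comment.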
+
+dynamo:
+  install: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml
new file mode 100644
index 000000000..eaa4536a4
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml
@@ -0,0 +1,117 @@
+name: "c16_ctx1_gen3_tep8_batch32_eplb0_mtp2"
+
+model:
+  path: "dsr1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: "fp8"
+
+sbatch_directives:
+  cpus-per-gpu: "16"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 1
+  prefill_workers: 1
+
+  decode_workers: 3
+  decode_nodes: 3
+
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  decode_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  trtllm_config:
+    prefill:
+      # Prefill Worker Config for Dynamo DSR1 (MTP mode)
+      # ISL/OSL: 8k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_chunked_prefill: false
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.85
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 32768
+      moe_config:
+        backend: CUTLASS
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 2
+    decode:
+      # Decode Worker Config for Dynamo DSR1 (MTP c=16)
+      # ISL/OSL: 8k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_chunked_prefill: false
+      max_batch_size: 32
+      max_num_tokens: 128
+      max_seq_len: 9256
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.85
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes: [1, 2, 4, 8, 16, 32]
+      disable_overlap_scheduler: false
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 2
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "48"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx.
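+# Ratio note: the 8k/1k recipes shift nodes toward prefill as concurrency grows
+# (1 ctx : 3 gen here, up to 3 ctx : 1 gen at c256), the opposite of the 1k/1k
+# sweep, which runs decode-heavy at 1 ctx : 9-11 gen.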
+ +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml new file mode 100644 index 000000000..d84bf05a5 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml @@ -0,0 +1,117 @@ +name: "c1_ctx1_gen7_tep8_batch1_eplb0_mtp3" + +model: + path: "dsr1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: "fp8" + +sbatch_directives: + cpus-per-gpu: "16" + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + + decode_workers: 7 + decode_nodes: 7 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + trtllm_config: + prefill: + # Prefill Worker Config for Dynamo DSR1 (MTP mode) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 32768 + moe_config: + backend: CUTLASS + cuda_graph_config: null + disable_overlap_scheduler: true + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + # Decode Worker Config for Dynamo DSR1 (MTP c=1) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 1 + max_num_tokens: 4 + max_seq_len: 9256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cuda_graph_config: + enable_padding: true + batch_sizes: [1] + disable_overlap_scheduler: false + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "9" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx. 
+ +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml new file mode 100644 index 000000000..19fa4c9f0 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml @@ -0,0 +1,117 @@ +name: "c256_ctx3_gen1_dep8_batch32_eplb0_mtp2" + +model: + path: "dsr1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: "fp8" + +sbatch_directives: + cpus-per-gpu: "16" + +resources: + gpu_type: "h200" + prefill_nodes: 3 + prefill_workers: 3 + + decode_workers: 1 + decode_nodes: 1 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + trtllm_config: + prefill: + # Prefill Worker Config for Dynamo DSR1 (MTP mode) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 32768 + moe_config: + backend: CUTLASS + cuda_graph_config: null + disable_overlap_scheduler: true + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + decode: + # Decode Worker Config for Dynamo DSR1 (MTP c=256) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_chunked_prefill: false + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 9256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cuda_graph_config: + enable_padding: true + batch_sizes: [1, 2, 4, 8, 16, 32] + disable_overlap_scheduler: false + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx. 
+ +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml new file mode 100644 index 000000000..6eca7fe9d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml @@ -0,0 +1,117 @@ +name: "c32_ctx3_gen5_tep8_batch32_eplb0_mtp3" + +model: + path: "dsr1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: "fp8" + +sbatch_directives: + cpus-per-gpu: "16" + +resources: + gpu_type: "h200" + prefill_nodes: 3 + prefill_workers: 3 + + decode_workers: 5 + decode_nodes: 5 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + trtllm_config: + prefill: + # Prefill Worker Config for Dynamo DSR1 (MTP mode) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 32768 + moe_config: + backend: CUTLASS + cuda_graph_config: null + disable_overlap_scheduler: true + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + # Decode Worker Config for Dynamo DSR1 (MTP c=32) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 9256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cuda_graph_config: + enable_padding: true + batch_sizes: [1, 2, 4, 8, 16, 32] + disable_overlap_scheduler: false + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "160" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx. 
+ +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml new file mode 100644 index 000000000..6cfd09aad --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml @@ -0,0 +1,117 @@ +name: "c4_ctx1_gen7_tep8_batch32_eplb0_mtp3" + +model: + path: "dsr1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: "fp8" + +sbatch_directives: + cpus-per-gpu: "16" + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + + decode_workers: 7 + decode_nodes: 7 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + trtllm_config: + prefill: + # Prefill Worker Config for Dynamo DSR1 (MTP mode) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 32768 + moe_config: + backend: CUTLASS + cuda_graph_config: null + disable_overlap_scheduler: true + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + # Decode Worker Config for Dynamo DSR1 (MTP c=4) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 9256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cuda_graph_config: + enable_padding: true + batch_sizes: [1, 2, 4, 8, 16, 32] + disable_overlap_scheduler: false + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "28" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx. 
+ +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml new file mode 100644 index 000000000..ab5a8fa71 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml @@ -0,0 +1,117 @@ +name: "c512_ctx3_gen1_dep8_batch64_eplb0_mtp1" + +model: + path: "dsr1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: "fp8" + +sbatch_directives: + cpus-per-gpu: "16" + +resources: + gpu_type: "h200" + prefill_nodes: 3 + prefill_workers: 3 + + decode_workers: 1 + decode_nodes: 1 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + trtllm_config: + prefill: + # Prefill Worker Config for Dynamo DSR1 (MTP mode) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 32768 + moe_config: + backend: CUTLASS + cuda_graph_config: null + disable_overlap_scheduler: true + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + decode: + # Decode Worker Config for Dynamo DSR1 (MTP c=512) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_chunked_prefill: false + max_batch_size: 64 + max_num_tokens: 256 + max_seq_len: 9256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cuda_graph_config: + enable_padding: true + batch_sizes: [1, 2, 4, 8, 16, 32, 64] + disable_overlap_scheduler: false + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "512" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx. 
+ +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml new file mode 100644 index 000000000..219a6f1b8 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml @@ -0,0 +1,117 @@ +name: "c64_ctx1_gen1_dep8_batch32_eplb0_mtp2" + +model: + path: "dsr1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: "fp8" + +sbatch_directives: + cpus-per-gpu: "16" + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + + decode_workers: 1 + decode_nodes: 1 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + trtllm_config: + prefill: + # Prefill Worker Config for Dynamo DSR1 (MTP mode) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 32768 + moe_config: + backend: CUTLASS + cuda_graph_config: null + disable_overlap_scheduler: true + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + decode: + # Decode Worker Config for Dynamo DSR1 (MTP c=64) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_chunked_prefill: false + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 9256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cuda_graph_config: + enable_padding: true + batch_sizes: [1, 2, 4, 8, 16, 32] + disable_overlap_scheduler: false + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "64" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx. 
+ +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml new file mode 100644 index 000000000..d8dd374c2 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml @@ -0,0 +1,117 @@ +name: "c8_ctx1_gen6_tep8_batch32_eplb0_mtp3" + +model: + path: "dsr1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: "fp8" + +sbatch_directives: + cpus-per-gpu: "16" + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + + decode_workers: 6 + decode_nodes: 6 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + trtllm_config: + prefill: + # Prefill Worker Config for Dynamo DSR1 (MTP mode) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 32768 + moe_config: + backend: CUTLASS + cuda_graph_config: null + disable_overlap_scheduler: true + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + # Decode Worker Config for Dynamo DSR1 (MTP c=8) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 9256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cuda_graph_config: + enable_padding: true + batch_sizes: [1, 2, 4, 8, 16, 32] + disable_overlap_scheduler: false + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "48" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx. 
+ +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml new file mode 100644 index 000000000..b92ecafe9 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml @@ -0,0 +1,114 @@ +name: "c128_ctx1_gen1_dep8_batch256_eplb0_mtp0" + +model: + path: "dsr1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: "fp8" + +sbatch_directives: + cpus-per-gpu: "16" + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + + decode_workers: 1 + decode_nodes: 1 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + trtllm_config: + prefill: + # Prefill Worker Config for Dynamo DSR1 (DEP mode) + # ISL/OSL: 8k/1k, TP=8 on H200 + # Matches E2E standalone ctx_config.yaml + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 32768 + moe_config: + backend: CUTLASS + cuda_graph_config: null + disable_overlap_scheduler: true + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + + decode: + # Decode Worker Config for Dynamo DSR1 (DEP mode) + # ISL/OSL: 8k/1k, TP=8 on H200 + # Matches E2E standalone gen_config.yaml (DEP c=128) + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_chunked_prefill: false + max_batch_size: 256 + max_num_tokens: 256 + max_seq_len: 9256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cuda_graph_config: + enable_padding: true + batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256] + disable_overlap_scheduler: false + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "128" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx. 
+ +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml new file mode 100644 index 000000000..65eddfb81 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml @@ -0,0 +1,111 @@ +name: "c16_ctx1_gen3_tep8_batch32_eplb0_mtp0" + +model: + path: "dsr1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: "fp8" + +sbatch_directives: + cpus-per-gpu: "16" + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + + decode_workers: 3 + decode_nodes: 3 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + trtllm_config: + prefill: + # Prefill Worker Config for Dynamo DSR1 (TEP mode) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 32768 + moe_config: + backend: CUTLASS + cuda_graph_config: null + disable_overlap_scheduler: true + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + decode: + # Decode Worker Config for Dynamo DSR1 (TEP c=16) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 9256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cuda_graph_config: + enable_padding: true + batch_sizes: [1, 2, 4, 8, 16, 32] + disable_overlap_scheduler: false + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "48" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx. 
+ +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml new file mode 100644 index 000000000..f42e7d15d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml @@ -0,0 +1,111 @@ +name: "c1_ctx1_gen7_tep8_batch1_eplb0_mtp0" + +model: + path: "dsr1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: "fp8" + +sbatch_directives: + cpus-per-gpu: "16" + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + + decode_workers: 7 + decode_nodes: 7 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + trtllm_config: + prefill: + # Prefill Worker Config for Dynamo DSR1 (TEP mode) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 32768 + moe_config: + backend: CUTLASS + cuda_graph_config: null + disable_overlap_scheduler: true + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + decode: + # Decode Worker Config for Dynamo DSR1 (TEP c=1) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 1 + max_num_tokens: 1 + max_seq_len: 9256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cuda_graph_config: + enable_padding: true + batch_sizes: [1] + disable_overlap_scheduler: false + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "9" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx. 
+ +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml new file mode 100644 index 000000000..5f96d875a --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml @@ -0,0 +1,111 @@ +name: "c256_ctx5_gen3_dep8_batch256_eplb0_mtp0" + +model: + path: "dsr1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: "fp8" + +sbatch_directives: + cpus-per-gpu: "16" + +resources: + gpu_type: "h200" + prefill_nodes: 5 + prefill_workers: 5 + + decode_workers: 3 + decode_nodes: 3 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + trtllm_config: + prefill: + # Prefill Worker Config for Dynamo DSR1 (DEP mode) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 32768 + moe_config: + backend: CUTLASS + cuda_graph_config: null + disable_overlap_scheduler: true + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + decode: + # Decode Worker Config for Dynamo DSR1 (DEP c=256) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_chunked_prefill: false + max_batch_size: 256 + max_num_tokens: 256 + max_seq_len: 9256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cuda_graph_config: + enable_padding: true + batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256] + disable_overlap_scheduler: false + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "768" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx. 
+ +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml new file mode 100644 index 000000000..5f2976b4d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml @@ -0,0 +1,111 @@ +name: "c32_ctx2_gen5_tep8_batch128_eplb0_mtp0" + +model: + path: "dsr1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: "fp8" + +sbatch_directives: + cpus-per-gpu: "16" + +resources: + gpu_type: "h200" + prefill_nodes: 2 + prefill_workers: 2 + + decode_workers: 5 + decode_nodes: 5 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + trtllm_config: + prefill: + # Prefill Worker Config for Dynamo DSR1 (TEP mode) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 32768 + moe_config: + backend: CUTLASS + cuda_graph_config: null + disable_overlap_scheduler: true + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + decode: + # Decode Worker Config for Dynamo DSR1 (TEP c=32) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 128 + max_num_tokens: 128 + max_seq_len: 9256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cuda_graph_config: + enable_padding: true + batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128] + disable_overlap_scheduler: false + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "160" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx. 
+ +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml new file mode 100644 index 000000000..72974bb20 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml @@ -0,0 +1,111 @@ +name: "c4_ctx1_gen7_tep8_batch32_eplb0_mtp0" + +model: + path: "dsr1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: "fp8" + +sbatch_directives: + cpus-per-gpu: "16" + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + + decode_workers: 7 + decode_nodes: 7 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + trtllm_config: + prefill: + # Prefill Worker Config for Dynamo DSR1 (TEP mode) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 32768 + moe_config: + backend: CUTLASS + cuda_graph_config: null + disable_overlap_scheduler: true + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + decode: + # Decode Worker Config for Dynamo DSR1 (TEP c=4) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 9256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cuda_graph_config: + enable_padding: true + batch_sizes: [1, 2, 4, 8, 16, 32] + disable_overlap_scheduler: false + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "28" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx. 
+ +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml new file mode 100644 index 000000000..a7a96394c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml @@ -0,0 +1,111 @@ +name: "c512_ctx3_gen1_dep8_batch512_eplb0_mtp0" + +model: + path: "dsr1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: "fp8" + +sbatch_directives: + cpus-per-gpu: "16" + +resources: + gpu_type: "h200" + prefill_nodes: 3 + prefill_workers: 3 + + decode_workers: 1 + decode_nodes: 1 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + trtllm_config: + prefill: + # Prefill Worker Config for Dynamo DSR1 (DEP mode) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 32768 + moe_config: + backend: CUTLASS + cuda_graph_config: null + disable_overlap_scheduler: true + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + decode: + # Decode Worker Config for Dynamo DSR1 (DEP c=512) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_chunked_prefill: false + max_batch_size: 512 + max_num_tokens: 512 + max_seq_len: 9256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cuda_graph_config: + enable_padding: true + batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512] + disable_overlap_scheduler: false + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "512" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx. 
+ +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml new file mode 100644 index 000000000..2a27575f2 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml @@ -0,0 +1,111 @@ +name: "c64_ctx2_gen3_dep8_batch128_eplb0_mtp0" + +model: + path: "dsr1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: "fp8" + +sbatch_directives: + cpus-per-gpu: "16" + +resources: + gpu_type: "h200" + prefill_nodes: 2 + prefill_workers: 2 + + decode_workers: 3 + decode_nodes: 3 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + trtllm_config: + prefill: + # Prefill Worker Config for Dynamo DSR1 (DEP mode) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 32768 + moe_config: + backend: CUTLASS + cuda_graph_config: null + disable_overlap_scheduler: true + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + decode: + # Decode Worker Config for Dynamo DSR1 (DEP c=64) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_chunked_prefill: false + max_batch_size: 128 + max_num_tokens: 128 + max_seq_len: 9256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cuda_graph_config: + enable_padding: true + batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128] + disable_overlap_scheduler: false + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "192" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx. 
+ +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml new file mode 100644 index 000000000..602646d9c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml @@ -0,0 +1,111 @@ +name: "c8_ctx1_gen6_tep8_batch16_eplb0_mtp0" + +model: + path: "dsr1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: "fp8" + +sbatch_directives: + cpus-per-gpu: "16" + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + + decode_workers: 6 + decode_nodes: 6 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + trtllm_config: + prefill: + # Prefill Worker Config for Dynamo DSR1 (TEP mode) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 32768 + moe_config: + backend: CUTLASS + cuda_graph_config: null + disable_overlap_scheduler: true + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + decode: + # Decode Worker Config for Dynamo DSR1 (TEP c=8) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 16 + max_num_tokens: 16 + max_seq_len: 9256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cuda_graph_config: + enable_padding: true + batch_sizes: [1, 2, 4, 8, 16] + disable_overlap_scheduler: false + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "48" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx. 
+ +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml new file mode 100644 index 000000000..ecdc9233a --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml @@ -0,0 +1,101 @@ +name: "kimi-vllm-disagg-gb200-1p1d-dep4-dep16" + +model: + path: "kimi-k2.5-nvfp4" + container: "vllm/vllm-openai:v0.18.0-cu130" + precision: "fp4" + +dynamo: + version: 1.0.1 + install: true + +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 4 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + decode_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 3072 + max-num-seqs: 4096 + enforce-eager: true + compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + max-num-batched-tokens: 16384 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}' + all2all-backend: "flashinfer_nvlink_one_sided" + gpu-memory-utilization: 0.9 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 3072 + max-num-seqs: 4096 + max-num-batched-tokens: 10240 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + async-scheduling: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + all2all-backend: "flashinfer_nvlink_one_sided" + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + max-cudagraph-capture-size: 512 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "256x512x1024x2048x3072x4096" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml new file mode 100644 index 000000000..43167b5f3 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml @@ 
-0,0 +1,98 @@ +name: "kimi-vllm-disagg-gb200-1p4d-dep4-tep4" + +model: + path: "kimi-k2.5-nvfp4" + container: "vllm/vllm-openai:v0.18.0-cu130" + precision: "fp4" + +dynamo: + version: 1.0.1 + install: true + +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 4 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + decode_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 3072 + max-num-seqs: 1024 + enforce-eager: true + compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + max-num-batched-tokens: 16384 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}' + all2all-backend: "flashinfer_nvlink_one_sided" + gpu-memory-utilization: 0.9 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 3072 + max-num-seqs: 1024 + max-num-batched-tokens: 10240 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + async-scheduling: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + max-cudagraph-capture-size: 1024 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x8x16x32x64x128" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml new file mode 100644 index 000000000..1ab6ca279 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml @@ -0,0 +1,98 @@ +name: "kimi-vllm-disagg-gb200-1p4d-dep4-tep4" + +model: + path: "kimi-k2.5-nvfp4" + container: "vllm/vllm-openai:v0.18.0-cu130" + precision: "fp4" + +dynamo: + version: 1.0.1 + install: true + +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 4 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + 
connector: null + + prefill_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + decode_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 10240 + max-num-seqs: 64 + enforce-eager: true + compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + max-num-batched-tokens: 16384 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}' + all2all-backend: "flashinfer_nvlink_one_sided" + gpu-memory-utilization: 0.9 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 10240 + max-num-seqs: 16 + max-num-batched-tokens: 10240 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + async-scheduling: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + max-cudagraph-capture-size: 16 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x8x16x32x128" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml new file mode 100644 index 000000000..ca4e9813f --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml @@ -0,0 +1,101 @@ +name: "kimi-vllm-disagg-gb200-3p1d-dep4-dep16" + +model: + path: "kimi-k2.5-nvfp4" + container: "vllm/vllm-openai:v0.18.0-cu130" + precision: "fp4" + +dynamo: + version: 1.0.1 + install: true + +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 3 + decode_nodes: 4 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + decode_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: 
"fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 10240 + max-num-seqs: 64 + enforce-eager: true + compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + max-num-batched-tokens: 16384 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}' + all2all-backend: "flashinfer_nvlink_one_sided" + gpu-memory-utilization: 0.9 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 10240 + max-num-seqs: 256 + max-num-batched-tokens: 10240 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + async-scheduling: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + all2all-backend: "flashinfer_nvlink_one_sided" + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + max-cudagraph-capture-size: 256 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "512x1024" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml new file mode 100644 index 000000000..cd9f94a9d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml @@ -0,0 +1,101 @@ +name: "kimi-vllm-disagg-gb200-5p1d-dep4-dep8" + +model: + path: "kimi-k2.5-nvfp4" + container: "vllm/vllm-openai:v0.18.0-cu130" + precision: "fp4" + +dynamo: + version: 1.0.1 + install: true + +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 5 + decode_nodes: 2 + prefill_workers: 5 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + decode_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 10240 + max-num-seqs: 64 + enforce-eager: true + compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + max-num-batched-tokens: 16384 + 
safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}' + all2all-backend: "flashinfer_nvlink_one_sided" + gpu-memory-utilization: 0.9 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 10240 + max-num-seqs: 512 + max-num-batched-tokens: 10240 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + async-scheduling: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + all2all-backend: "flashinfer_nvlink_one_sided" + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + max-cudagraph-capture-size: 512 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2048" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml new file mode 100644 index 000000000..47d3d7ee5 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml @@ -0,0 +1,101 @@ +name: "kimi-vllm-disagg-gb200-6p1d-dep4-dep16" + +model: + path: "kimi-k2.5-nvfp4" + container: "vllm/vllm-openai:v0.18.0-cu130" + precision: "fp4" + +dynamo: + version: 1.0.1 + install: true + +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 4 + prefill_workers: 6 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + decode_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 10240 + max-num-seqs: 64 + enforce-eager: true + compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + max-num-batched-tokens: 16384 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}' + all2all-backend: "flashinfer_nvlink_one_sided" + gpu-memory-utilization: 0.9 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": 
"kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 10240 + max-num-seqs: 512 + max-num-batched-tokens: 10240 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + async-scheduling: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + all2all-backend: "flashinfer_nvlink_one_sided" + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + max-cudagraph-capture-size: 512 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "3072x4096" + req_rate: "inf" diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index edf5db957..f465b4cdf 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -105,7 +105,7 @@ EOF echo "Submitting job with srtctl..." if [[ -z "$CONFIG_FILE" ]]; then - echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2 + echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a 'recipe:' field on the search-space entry (resolved by benchmark-multinode-tmpl.yml)." >&2 echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 exit 1 fi diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 3c855e805..e7fd1ea49 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -108,7 +108,7 @@ export INFMAX_WORKSPACE="$GITHUB_WORKSPACE" echo "Submitting job with srtctl..." if [[ -z "$CONFIG_FILE" ]]; then - echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2 + echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a 'recipe:' field on the search-space entry (resolved by benchmark-multinode-tmpl.yml)." >&2 echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 exit 1 fi diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 224c3a928..948689c76 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -125,10 +125,11 @@ PY fi -# srt-slurm path requires a CONFIG_FILE pointing to a recipe YAML. -# Without it, srtctl apply scans every YAML in the repo and submits hundreds of jobs. +# srt-slurm path requires CONFIG_FILE (set by benchmark-multinode-tmpl.yml from +# the search-space `recipe:` field). Without it, srtctl apply scans every YAML +# in the repo and submits hundreds of jobs. if [[ -z "$CONFIG_FILE" ]]; then - echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2 + echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a 'recipe:' field on the search-space entry (resolved by benchmark-multinode-tmpl.yml)." 
>&2 echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 exit 1 fi @@ -140,21 +141,10 @@ if [ -d "$SRT_REPO_DIR" ]; then rm -rf "$SRT_REPO_DIR" fi -if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then - git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" - cd "$SRT_REPO_DIR" - git checkout sa-submission-q2-2026 - # Use `cp -rT` so if the upstream branch ever ships a stub - # `recipes/vllm/deepseek-v4/` directory, we overlay our recipes onto - # it rather than nesting (`cp -r src dst` would create - # `recipes/vllm/deepseek-v4/deepseek-v4/...` in that case). - mkdir -p recipes/vllm/deepseek-v4 - cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4 -elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then - git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" - cd "$SRT_REPO_DIR" - git checkout sa-submission-q2-2026 -elif [[ $FRAMEWORK == "dynamo-trt" && $MODEL_PREFIX == "kimik2.5" ]]; then +# We only clone srt-slurm to install srtctl + pick up its sibling configs +# (configs/, expert-distributions/, etc). The recipe itself is supplied as an +# absolute CONFIG_FILE pointing at benchmarks/multi_node/srt-slurm-recipes/. +if [[ $FRAMEWORK == "dynamo-vllm" || ( $FRAMEWORK == "dynamo-trt" && $MODEL_PREFIX == "kimik2.5" ) ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" git checkout sa-submission-q2-2026 diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index 5f48ddcec..9af41a1ef 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -103,7 +103,7 @@ export INFMAX_WORKSPACE="$GITHUB_WORKSPACE" echo "Submitting job with srtctl..." if [[ -z "$CONFIG_FILE" ]]; then - echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2 + echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a 'recipe:' field on the search-space entry (resolved by benchmark-multinode-tmpl.yml)." >&2 echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 exit 1 fi diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index 5a2ab64d2..f531c19bf 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -126,7 +126,7 @@ EOF echo "Submitting job with srtctl..." if [[ -z "$CONFIG_FILE" ]]; then - echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2 + echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a 'recipe:' field on the search-space entry (resolved by benchmark-multinode-tmpl.yml)." >&2 echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 exit 1 fi diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh index e11ca7b20..368577d7c 100755 --- a/runners/launch_h200-dgxc-slurm.sh +++ b/runners/launch_h200-dgxc-slurm.sh @@ -119,7 +119,7 @@ EOF echo "Submitting job with srtctl..." if [[ -z "$CONFIG_FILE" ]]; then - echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2 + echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a 'recipe:' field on the search-space entry (resolved by benchmark-multinode-tmpl.yml)." 
>&2 echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 exit 1 fi diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py index e543bb4af..44613e8eb 100644 --- a/utils/matrix_logic/generate_sweep_configs.py +++ b/utils/matrix_logic/generate_sweep_configs.py @@ -267,6 +267,8 @@ def generate_full_sweep(args, all_config_data, runner_data): seq_len_str = seq_len_to_str(isl, osl) runners_for_entry = runner_nodes_to_use if runner_nodes_to_use else [runner] + recipe = bmk.get(Fields.RECIPE.value) + for runner_value in runners_for_entry: entry = { Fields.IMAGE.value: image, @@ -285,6 +287,7 @@ def generate_full_sweep(args, all_config_data, runner_data): Fields.EXP_NAME.value: f"{model_code}_{seq_len_str}", Fields.DISAGG.value: disagg, Fields.RUN_EVAL.value: False, # Default, may be overridden by mark_eval_entries + Fields.RECIPE.value: recipe, } validate_matrix_entry(entry, is_multinode) @@ -463,6 +466,7 @@ def get_lowest_conc(search_space_entry): Fields.SPEC_DECODING.value, "none") prefill_config = lowest_conc_entry[Fields.PREFILL.value] decode_config = lowest_conc_entry[Fields.DECODE.value] + recipe = lowest_conc_entry.get(Fields.RECIPE.value) for node in runner_nodes: entry = { @@ -494,6 +498,7 @@ def get_lowest_conc(search_space_entry): Fields.EXP_NAME.value: f"{model_code}_test", Fields.DISAGG.value: disagg, Fields.RUN_EVAL.value: False, + Fields.RECIPE.value: recipe, } matrix_values.append(validate_matrix_entry(entry, is_multinode=True)) else: @@ -620,6 +625,7 @@ def generate_test_config_sweep(args, all_config_data): Fields.EXP_NAME.value: f"{model_code}_{seq_len_str}", Fields.DISAGG.value: disagg, Fields.RUN_EVAL.value: False, + Fields.RECIPE.value: bmk.get(Fields.RECIPE.value), } matrix_values.append(validate_matrix_entry(entry, is_multinode=True)) else: diff --git a/utils/matrix_logic/validation.py b/utils/matrix_logic/validation.py index ce10840b5..7f1fa3326 100644 --- a/utils/matrix_logic/validation.py +++ b/utils/matrix_logic/validation.py @@ -1,3 +1,5 @@ +from pathlib import Path + from pydantic import BaseModel, Field, ValidationError, ConfigDict, model_validator from typing import List, Optional, Union, Literal from enum import Enum @@ -5,6 +7,11 @@ import pprint import yaml +# Repo-relative root for first-class srt-slurm recipes referenced by the +# `recipe:` field on multi-node search-space entries. Resolved against the +# repository root (parent of utils/) so callers can run from any cwd. +RECIPES_ROOT = Path(__file__).resolve().parents[2] / "benchmarks" / "multi_node" / "srt-slurm-recipes" + """ The below class defines the field names expected to be present in the JSON entries for both single-node and multi-node configurations. @@ -44,6 +51,7 @@ class Fields(Enum): BATCH_SIZE = 'batch-size' MAX_NUM_TOKENS = 'max-num-tokens' ADDITIONAL_SETTINGS = 'additional-settings' + RECIPE = 'recipe' # Matrix entry fields CONC = 'conc' @@ -131,6 +139,11 @@ class MultiNodeMatrixEntry(BaseModel): run_eval: bool = Field(alias=Fields.RUN_EVAL.value) eval_only: bool = Field(alias=Fields.EVAL_ONLY.value, default=False) eval_conc: Optional[int] = Field(default=None, alias=Fields.EVAL_CONC.value) + # Path under benchmarks/multi_node/srt-slurm-recipes/ identifying the + # srt-slurm recipe to dispatch. May carry an `:override[N]` suffix that the + # launcher strips before resolving the file on disk. Optional because not + # every multi-node config uses srt-slurm. 
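+ # Illustrative values (hypothetical recipe names, shown for shape only):
+ #   recipe: "b200-fp8/8k1k.yaml"              # plain file reference
+ #   recipe: "b200-fp8/8k1k.yaml:override[2]"  # with override selector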
+ recipe: Optional[str] = None def validate_matrix_entry(entry: dict, is_multinode: bool) -> dict: @@ -234,11 +247,31 @@ class MultiNodeSearchSpaceEntry(BaseModel): default=None, alias=Fields.CONC_END.value) conc_list: Optional[List[int]] = Field( default=None, alias=Fields.CONC_LIST.value) + # First-class srt-slurm recipe reference. Path is relative to + # benchmarks/multi_node/srt-slurm-recipes/ and may carry an + # `:override[N]` suffix to select an in-yaml override section. + recipe: Optional[str] = None @model_validator(mode='after') def validate_conc_fields(self): return _validate_conc_fields(self) + @model_validator(mode='after') + def validate_recipe_exists(self): + if self.recipe is None: + return self + # Strip `:override[...]` suffix used by sglang-style recipes that + # carry multiple override sections in one file. + recipe_path = self.recipe.split(':', 1)[0] + full_path = RECIPES_ROOT / recipe_path + if not full_path.is_file(): + raise ValueError( + f"Recipe file not found: '{self.recipe}' " + f"(resolved to '{full_path}'). " + f"Recipes must live under benchmarks/multi_node/srt-slurm-recipes/." + ) + return self + class SingleNodeSeqLenConfig(BaseModel): """Single node sequence length configuration.""" From 89bf3e37ca993c91d8d998b3c280962e4360b504 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 28 Apr 2026 09:56:21 -0500 Subject: [PATCH 02/16] runners: factor srt-slurm clone+srtctl install into benchmark_lib helper Six launchers each carried a ~22-line copy of the same git-clone, uv-install, venv-create, srtctl-install sequence. Lift it into clone_and_install_srtctl() in benchmarks/benchmark_lib.sh, parameterized by SRT_REPO_URL/SRT_BRANCH and UV_INSTALL_DIR/UV_VENV_DIR env vars so each launcher can keep its workspace- vs-NFS-vs-default-HOME placement decisions explicit at the call site. Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/benchmark_lib.sh | 49 +++++++++++++++++++++++++++++++ runners/launch_b200-dgxc.sh | 29 ++++-------------- runners/launch_b300-nv.sh | 29 ++++-------------- runners/launch_gb200-nv.sh | 34 +++++---------------- runners/launch_gb300-nv.sh | 29 ++++-------------- runners/launch_h100-dgxc-slurm.sh | 35 +++++----------------- runners/launch_h200-dgxc-slurm.sh | 26 ++-------------- 7 files changed, 82 insertions(+), 149 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 268745735..576cf7c4b 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -862,3 +862,52 @@ run_eval() { fi return $eval_rc } + +# -------------------------------- +# srt-slurm helpers +# -------------------------------- + +# Clone srt-slurm and install `srtctl` into a uv venv. After this returns +# successfully, cwd is the cloned repo and the venv is active. Idempotent on +# uv: skips re-curl if the binary is already present at $UV_INSTALL_DIR. 
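+#
+# Typical call (matches the workspace-pinned launchers later in this patch):
+#   UV_INSTALL_DIR="$GITHUB_WORKSPACE/.local/bin" \
+#   UV_VENV_DIR="$GITHUB_WORKSPACE/.venv" \
+#   clone_and_install_srtctl || exit 1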
+# +# All inputs are env vars (set before calling); all are optional: +# SRT_REPO_URL default https://github.com/NVIDIA/srt-slurm.git +# SRT_BRANCH default sa-submission-q2-2026 +# SRT_REPO_DIR default srt-slurm (relative to current cwd) +# UV_INSTALL_DIR default $HOME/.local/bin (uv's own default) +# UV_VENV_DIR default .venv (inside the cloned repo) +clone_and_install_srtctl() { + local repo_url="${SRT_REPO_URL:-https://github.com/NVIDIA/srt-slurm.git}" + local branch="${SRT_BRANCH:-sa-submission-q2-2026}" + local repo_dir="${SRT_REPO_DIR:-srt-slurm}" + local uv_install_dir="${UV_INSTALL_DIR:-${HOME}/.local/bin}" + local uv_venv_dir="${UV_VENV_DIR:-.venv}" + + echo "Cloning ${repo_url}@${branch} into ${repo_dir}..." + rm -rf "$repo_dir" + git clone "$repo_url" "$repo_dir" + cd "$repo_dir" || return 1 + git checkout "$branch" + + echo "Installing uv + srtctl into venv at ${uv_venv_dir}..." + export UV_INSTALL_DIR="$uv_install_dir" + mkdir -p "$uv_install_dir" + if ! [ -x "$uv_install_dir/uv" ]; then + curl -LsSf https://astral.sh/uv/install.sh | sh + fi + export PATH="$uv_install_dir:$PATH" + # uv's installer drops an `env` script next to the binary; source it so + # PATH/PS1 changes pick up in shells that don't re-read the env. + [ -f "$uv_install_dir/env" ] && source "$uv_install_dir/env" + + uv venv "$uv_venv_dir" + # shellcheck disable=SC1091 + source "$uv_venv_dir/bin/activate" + uv pip install -e . + + if ! command -v srtctl &> /dev/null; then + echo "Error: Failed to install srtctl" >&2 + return 1 + fi +} diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index f465b4cdf..f0ad3deed 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -4,6 +4,8 @@ SLURM_PARTITION="gpu" SLURM_ACCOUNT="benchmark" +source "$(dirname "$0")/../benchmarks/benchmark_lib.sh" + set -x if [[ "$IS_MULTINODE" == "true" ]]; then @@ -29,30 +31,9 @@ if [[ "$IS_MULTINODE" == "true" ]]; then fi export SERVED_MODEL_NAME=$MODEL - echo "Cloning srt-slurm repository..." - SRT_REPO_DIR="srt-slurm" - if [ -d "$SRT_REPO_DIR" ]; then - echo "Removing existing $SRT_REPO_DIR..." - rm -rf "$SRT_REPO_DIR" - fi - - git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" - cd "$SRT_REPO_DIR" || exit 1 - git checkout sa-submission-q2-2026 - - echo "Installing srtctl..." - export UV_INSTALL_DIR="$GITHUB_WORKSPACE/.local/bin" - curl -LsSf https://astral.sh/uv/install.sh | sh - export PATH="$UV_INSTALL_DIR:$PATH" - - uv venv "$GITHUB_WORKSPACE/.venv" - source "$GITHUB_WORKSPACE/.venv/bin/activate" - uv pip install -e . - - if ! command -v srtctl &> /dev/null; then - echo "Error: Failed to install srtctl" - exit 1 - fi + UV_INSTALL_DIR="$GITHUB_WORKSPACE/.local/bin" \ + UV_VENV_DIR="$GITHUB_WORKSPACE/.venv" \ + clone_and_install_srtctl || exit 1 # Map container images to local squash files NGINX_IMAGE="nginx:1.27.4" diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index e7fd1ea49..6fc373a41 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -4,6 +4,8 @@ SLURM_PARTITION="batch_1" SLURM_ACCOUNT="benchmark" +source "$(dirname "$0")/../benchmarks/benchmark_lib.sh" + set -x if [[ "$IS_MULTINODE" == "true" ]]; then @@ -30,30 +32,9 @@ else exit 1 fi -echo "Cloning srt-slurm repository..." -SRT_REPO_DIR="srt-slurm" -if [ -d "$SRT_REPO_DIR" ]; then - echo "Removing existing $SRT_REPO_DIR..." 
- rm -rf "$SRT_REPO_DIR" -fi - -git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" -cd "$SRT_REPO_DIR" || exit 1 -git checkout sa-submission-q2-2026 - -echo "Installing srtctl..." -export UV_INSTALL_DIR="$GITHUB_WORKSPACE/.local/bin" -curl -LsSf https://astral.sh/uv/install.sh | sh -export PATH="$UV_INSTALL_DIR:$PATH" - -uv venv "$GITHUB_WORKSPACE/.venv" -source "$GITHUB_WORKSPACE/.venv/bin/activate" -uv pip install -e . - -if ! command -v srtctl &> /dev/null; then - echo "Error: Failed to install srtctl" - exit 1 -fi +UV_INSTALL_DIR="$GITHUB_WORKSPACE/.local/bin" \ +UV_VENV_DIR="$GITHUB_WORKSPACE/.venv" \ + clone_and_install_srtctl || exit 1 # Map container images to local squash files NGINX_IMAGE="nginx:1.27.4" diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 948689c76..13bcd9a5d 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -2,6 +2,8 @@ # This script sets up the environment and launches multi-node benchmarks +source "$(dirname "$0")/../benchmarks/benchmark_lib.sh" + set -x # MODEL_PATH: Override with pre-downloaded paths on GB200 runner @@ -134,38 +136,18 @@ if [[ -z "$CONFIG_FILE" ]]; then exit 1 fi -echo "Cloning srt-slurm repository..." -SRT_REPO_DIR="srt-slurm" -if [ -d "$SRT_REPO_DIR" ]; then - echo "Removing existing $SRT_REPO_DIR..." - rm -rf "$SRT_REPO_DIR" -fi - # We only clone srt-slurm to install srtctl + pick up its sibling configs # (configs/, expert-distributions/, etc). The recipe itself is supplied as an # absolute CONFIG_FILE pointing at benchmarks/multi_node/srt-slurm-recipes/. if [[ $FRAMEWORK == "dynamo-vllm" || ( $FRAMEWORK == "dynamo-trt" && $MODEL_PREFIX == "kimik2.5" ) ]]; then - git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" - cd "$SRT_REPO_DIR" - git checkout sa-submission-q2-2026 + SRT_REPO_URL=https://github.com/NVIDIA/srt-slurm.git + SRT_BRANCH=sa-submission-q2-2026 else - git clone https://github.com/ishandhanani/srt-slurm.git "$SRT_REPO_DIR" - cd "$SRT_REPO_DIR" - git checkout sa-submission-q1-2026 -fi - -echo "Installing srtctl..." -curl -LsSf https://astral.sh/uv/install.sh | sh -source $HOME/.local/bin/env - -uv venv -source .venv/bin/activate -uv pip install -e . - -if ! command -v srtctl &> /dev/null; then - echo "Error: Failed to install srtctl" - exit 1 + SRT_REPO_URL=https://github.com/ishandhanani/srt-slurm.git + SRT_BRANCH=sa-submission-q1-2026 fi +SRT_REPO_URL="$SRT_REPO_URL" SRT_BRANCH="$SRT_BRANCH" \ + clone_and_install_srtctl || exit 1 echo "Configs available at: $SRT_REPO_DIR/" diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index 9af41a1ef..58f82eb83 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -2,6 +2,8 @@ # This script sets up the environment and launches multi-node benchmarks +source "$(dirname "$0")/../benchmarks/benchmark_lib.sh" + set -x export SLURM_PARTITION="batch" @@ -36,30 +38,9 @@ export EVAL_ONLY="${EVAL_ONLY:-false}" export ISL="$ISL" export OSL="$OSL" -echo "Cloning srt-slurm repository..." -SRT_REPO_DIR="srt-slurm" -if [ -d "$SRT_REPO_DIR" ]; then - echo "Removing existing $SRT_REPO_DIR..." - rm -rf "$SRT_REPO_DIR" -fi - -git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" -cd "$SRT_REPO_DIR" -git checkout sa-submission-q2-2026 - -echo "Installing srtctl..." 
-export UV_INSTALL_DIR="$GITHUB_WORKSPACE/.local/bin" -curl -LsSf https://astral.sh/uv/install.sh | sh -export PATH="$UV_INSTALL_DIR:$PATH" - -uv venv "$GITHUB_WORKSPACE/.venv" -source "$GITHUB_WORKSPACE/.venv/bin/activate" -uv pip install -e . - -if ! command -v srtctl &> /dev/null; then - echo "Error: Failed to install srtctl" - exit 1 -fi +UV_INSTALL_DIR="$GITHUB_WORKSPACE/.local/bin" \ +UV_VENV_DIR="$GITHUB_WORKSPACE/.venv" \ + clone_and_install_srtctl || exit 1 echo "Configs available at: $SRT_REPO_DIR/" diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index f531c19bf..602664a09 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -5,6 +5,8 @@ SLURM_PARTITION="hpc-gpu-1" SLURM_ACCOUNT="customer" SLURM_EXCLUDED_NODELIST="hpc-gpu-1-7" +source "$(dirname "$0")/../benchmarks/benchmark_lib.sh" + set -x if [[ "$IS_MULTINODE" == "true" ]]; then @@ -34,36 +36,13 @@ if [[ "$IS_MULTINODE" == "true" ]]; then exit 1 fi - echo "Cloning srt-slurm repository..." - SRT_REPO_DIR="srt-slurm" - if [ -d "$SRT_REPO_DIR" ]; then - echo "Removing existing $SRT_REPO_DIR..." - rm -rf "$SRT_REPO_DIR" - fi - - git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" - cd "$SRT_REPO_DIR" - git checkout sa-submission-q2-2026 - - echo "Installing srtctl..." - export UV_INSTALL_DIR="/mnt/nfs/sa-shared/.uv/bin" + # Pin uv state onto the NFS-shared volume so cluster nodes share a single + # cached install, and so the binary persists across runner workspaces. export UV_CACHE_DIR="/mnt/nfs/sa-shared/.uv/cache" export UV_PYTHON_INSTALL_DIR="/mnt/nfs/sa-shared/.uv/python" - mkdir -p "$UV_INSTALL_DIR" "$UV_CACHE_DIR" "$UV_PYTHON_INSTALL_DIR" - if ! [ -x "$UV_INSTALL_DIR/uv" ]; then - curl -LsSf https://astral.sh/uv/install.sh | sh - fi - export PATH="$UV_INSTALL_DIR:$PATH" - source $UV_INSTALL_DIR/env - - uv venv - source .venv/bin/activate - uv pip install -e . - - if ! command -v srtctl &> /dev/null; then - echo "Error: Failed to install srtctl" - exit 1 - fi + mkdir -p "$UV_CACHE_DIR" "$UV_PYTHON_INSTALL_DIR" + UV_INSTALL_DIR="/mnt/nfs/sa-shared/.uv/bin" \ + clone_and_install_srtctl || exit 1 echo "Configs available at: $SRT_REPO_DIR/" diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh index 368577d7c..b61cbb0bf 100755 --- a/runners/launch_h200-dgxc-slurm.sh +++ b/runners/launch_h200-dgxc-slurm.sh @@ -4,6 +4,8 @@ SLURM_PARTITION="main" SLURM_ACCOUNT="sa-shared" +source "$(dirname "$0")/../benchmarks/benchmark_lib.sh" + set -x if [[ "$IS_MULTINODE" == "true" ]]; then @@ -33,29 +35,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then exit 1 fi - echo "Cloning srt-slurm repository..." - SRT_REPO_DIR="srt-slurm" - if [ -d "$SRT_REPO_DIR" ]; then - echo "Removing existing $SRT_REPO_DIR..." - rm -rf "$SRT_REPO_DIR" - fi - - git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" - cd "$SRT_REPO_DIR" - git checkout sa-submission-q2-2026 - - echo "Installing srtctl..." - curl -LsSf https://astral.sh/uv/install.sh | sh - source $HOME/.local/bin/env - - uv venv - source .venv/bin/activate - uv pip install -e . - - if ! 
command -v srtctl &> /dev/null; then - echo "Error: Failed to install srtctl" - exit 1 - fi + clone_and_install_srtctl || exit 1 echo "Configs available at: $SRT_REPO_DIR/" From d29d06615d71747d469a082a923c0d65338e4005 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 28 Apr 2026 10:09:08 -0500 Subject: [PATCH 03/16] runners: factor image-filename sanitization into benchmark_lib helper Lift the `echo "$IMAGE" | sed 's/[/:@#]/_/g'` slug used to name squash files out of 13 launchers and into sanitize_image_filename() in benchmark_lib.sh. Cluster-specific separator (h100/h200-dgxc-slurm use '+' instead of '_') is expressed as the second arg, and the nvcr.io/-prefix-strip variant becomes `sanitize_image_filename "${IMAGE#nvcr.io/}" +` rather than a sed pipeline. Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/benchmark_lib.sh | 14 ++++++++++++++ runners/launch_b200-cw.sh | 4 +++- runners/launch_b200-dgxc.sh | 6 +++--- runners/launch_b300-nv.sh | 6 +++--- runners/launch_gb200-nv.sh | 4 ++-- runners/launch_gb300-nv.sh | 4 ++-- runners/launch_h100-cw.sh | 4 +++- runners/launch_h100-dgxc-slurm.sh | 4 ++-- runners/launch_h200-cw.sh | 4 +++- runners/launch_h200-dgxc-slurm.sh | 6 +++--- runners/launch_h200-nb.sh | 4 +++- runners/launch_mi300x-amds.sh | 4 +++- runners/launch_mi325x-amds.sh | 4 +++- runners/launch_mi355x-amds.sh | 4 +++- 14 files changed, 50 insertions(+), 22 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 576cf7c4b..92998de27 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -863,6 +863,20 @@ run_eval() { return $eval_rc } +# -------------------------------- +# Container helpers +# -------------------------------- + +# Sanitize a container image reference (e.g. "lmsysorg/sglang:v0.5.8-cu130") +# into a filename-safe slug by replacing /, :, @, # with the chosen separator. +# Defaults to '_' (most clusters); pass '+' for clusters that adopted that +# convention for their squash-file directory. +sanitize_image_filename() { + local image="$1" + local sep="${2:-_}" + echo "$image" | sed "s|[/:@#]|${sep}|g" +} + # -------------------------------- # srt-slurm helpers # -------------------------------- diff --git a/runners/launch_b200-cw.sh b/runners/launch_b200-cw.sh index 0b2dbf305..fbdd60554 100644 --- a/runners/launch_b200-cw.sh +++ b/runners/launch_b200-cw.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +source "$(dirname "$0")/../benchmarks/benchmark_lib.sh" + export HF_HUB_CACHE_MOUNT="/tmp/gharunner/hf-hub-cache" export PORT=8888 @@ -16,7 +18,7 @@ if [[ ! 
-f "$BENCH_SCRIPT" ]]; then fi PARTITION="b200" -SQUASH_FILE="/tmp/gharunner/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +SQUASH_FILE="/tmp/gharunner/squash/$(sanitize_image_filename "$IMAGE").sqsh" LOCK_FILE="${SQUASH_FILE}.lock" # TODO(Cam): lmsysorg/sglang:deepseek-v4-blackwell installs sglang editable at diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index f0ad3deed..3e294f859 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -37,8 +37,8 @@ if [[ "$IS_MULTINODE" == "true" ]]; then # Map container images to local squash files NGINX_IMAGE="nginx:1.27.4" - SQUASH_FILE="/home/sa-shared/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" - NGINX_SQUASH_FILE="/home/sa-shared/containers/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + SQUASH_FILE="/home/sa-shared/containers/$(sanitize_image_filename "$IMAGE").sqsh" + NGINX_SQUASH_FILE="/home/sa-shared/containers/$(sanitize_image_filename "$NGINX_IMAGE").sqsh" # Import containers via enroot enroot import -o $SQUASH_FILE docker://$IMAGE @@ -231,7 +231,7 @@ EOF else HF_HUB_CACHE_MOUNT="/scratch/fsw/gharunners/hf-hub-cache" - SQUASH_FILE="/home/sa-shared/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + SQUASH_FILE="/home/sa-shared/containers/$(sanitize_image_filename "$IMAGE").sqsh" FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') # Prefer a framework-tagged script (e.g. dsv4_fp4_b200_vllm.sh) so models diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 6fc373a41..23f75ac80 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -38,8 +38,8 @@ UV_VENV_DIR="$GITHUB_WORKSPACE/.venv" \ # Map container images to local squash files NGINX_IMAGE="nginx:1.27.4" -SQUASH_FILE="/data/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" -NGINX_SQUASH_FILE="/data/squash/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +SQUASH_FILE="/data/squash/$(sanitize_image_filename "$IMAGE").sqsh" +NGINX_SQUASH_FILE="/data/squash/$(sanitize_image_filename "$NGINX_IMAGE").sqsh" # Import containers via enroot srun -N 1 -A $SLURM_ACCOUNT -p $SLURM_PARTITION bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" @@ -239,7 +239,7 @@ else elif [[ "$MODEL_PREFIX" == "dsv4" ]]; then export MODEL="$HF_HUB_CACHE_MOUNT/dsv4-pro" fi - SQUASH_FILE="/data/home/sa-shared/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + SQUASH_FILE="/data/home/sa-shared/gharunners/squash/$(sanitize_image_filename "$IMAGE").sqsh" SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') # Prefer a framework-tagged script (e.g. 
dsv4_fp4_b300_sglang.sh) so models # with multiple inference engines can coexist; fall back to the historical diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 13bcd9a5d..e9c3e62b8 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -64,8 +64,8 @@ export SLURM_ACCOUNT="benchmark" NGINX_IMAGE="nginx:1.27.4" -SQUASH_FILE="/mnt/lustre01/users-public/sa-shared/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" -NGINX_SQUASH_FILE="/mnt/lustre01/users-public/sa-shared/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +SQUASH_FILE="/mnt/lustre01/users-public/sa-shared/$(sanitize_image_filename "$IMAGE").sqsh" +NGINX_SQUASH_FILE="/mnt/lustre01/users-public/sa-shared/$(sanitize_image_filename "$NGINX_IMAGE").sqsh" enroot import -o $SQUASH_FILE docker://$IMAGE enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index 58f82eb83..a0790260e 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -27,8 +27,8 @@ fi NGINX_IMAGE="nginx:1.27.4" -SQUASH_FILE="/home/sa-shared/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" -NGINX_SQUASH_FILE="/home/sa-shared/squash/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +SQUASH_FILE="/home/sa-shared/squash/$(sanitize_image_filename "$IMAGE").sqsh" +NGINX_SQUASH_FILE="/home/sa-shared/squash/$(sanitize_image_filename "$NGINX_IMAGE").sqsh" srun --partition=$SLURM_PARTITION --exclusive --time=180 bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" srun --partition=$SLURM_PARTITION --exclusive --time=180 bash -c "enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE" diff --git a/runners/launch_h100-cw.sh b/runners/launch_h100-cw.sh index f3198ca8c..e036e6219 100644 --- a/runners/launch_h100-cw.sh +++ b/runners/launch_h100-cw.sh @@ -1,8 +1,10 @@ #!/usr/bin/env bash +source "$(dirname "$0")/../benchmarks/benchmark_lib.sh" + export HF_HUB_CACHE_MOUNT="/mnt/vast/gharunner/hf-hub-cache" PARTITION="h100" -SQUASH_FILE="/mnt/vast/gharunner/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +SQUASH_FILE="/mnt/vast/gharunner/squash/$(sanitize_image_filename "$IMAGE").sqsh" LOCK_FILE="${SQUASH_FILE}.lock" set -x diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index 602664a09..f95816448 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -56,7 +56,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then elif [[ $FRAMEWORK == "dynamo-trt" ]]; then # TRT-LLM container mapping - convert IMAGE to srt-slurm format (nvcr.io/ -> nvcr.io#) CONTAINER_KEY=$(echo "$IMAGE" | sed 's|nvcr.io/|nvcr.io#|') - SQUASH_FILE="/mnt/nfs/sa-shared/containers/$(echo "$IMAGE" | sed 's|nvcr.io/||' | sed 's/[\/:@#]/+/g').sqsh" + SQUASH_FILE="/mnt/nfs/sa-shared/containers/$(sanitize_image_filename "${IMAGE#nvcr.io/}" +).sqsh" fi export ISL="$ISL" @@ -249,7 +249,7 @@ EOF else HF_HUB_CACHE_MOUNT="/mnt/nfs/sa-shared/gharunners/hf-hub-cache/" - SQUASH_FILE="/mnt/nfs/lustre/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + SQUASH_FILE="/mnt/nfs/lustre/containers/$(sanitize_image_filename "$IMAGE").sqsh" salloc --exclude="$SLURM_EXCLUDED_NODELIST" --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) diff --git a/runners/launch_h200-cw.sh b/runners/launch_h200-cw.sh index 84b40480c..08bbbc757 100644 --- a/runners/launch_h200-cw.sh +++ 
b/runners/launch_h200-cw.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +source "$(dirname "$0")/../benchmarks/benchmark_lib.sh" + export HF_HUB_CACHE_MOUNT="/mnt/vast/gharunner/hf-hub-cache" export PORT=8888 @@ -8,7 +10,7 @@ FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') PARTITION="h200" -SQUASH_FILE="/mnt/vast/gharunner/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +SQUASH_FILE="/mnt/vast/gharunner/squash/$(sanitize_image_filename "$IMAGE").sqsh" LOCK_FILE="${SQUASH_FILE}.lock" set -x diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh index b61cbb0bf..71a64025f 100755 --- a/runners/launch_h200-dgxc-slurm.sh +++ b/runners/launch_h200-dgxc-slurm.sh @@ -44,12 +44,12 @@ if [[ "$IS_MULTINODE" == "true" ]]; then if [[ $FRAMEWORK == "dynamo-sglang" ]]; then # SGLang container mapping - SQUASH_FILE="/data/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/+/g').sqsh" + SQUASH_FILE="/data/containers/$(sanitize_image_filename "$IMAGE" +).sqsh" CONTAINER_KEY="$IMAGE" elif [[ $FRAMEWORK == "dynamo-trt" ]]; then # TRT-LLM container mapping - convert IMAGE to srt-slurm format (nvcr.io/ -> nvcr.io#) CONTAINER_KEY=$(echo "$IMAGE" | sed 's|nvcr.io/|nvcr.io#|') - SQUASH_FILE="/data/containers/$(echo "$IMAGE" | sed 's|nvcr.io/||' | sed 's/[\/:@#]/+/g').sqsh" + SQUASH_FILE="/data/containers/$(sanitize_image_filename "${IMAGE#nvcr.io/}" +).sqsh" fi export ISL="$ISL" @@ -242,7 +242,7 @@ EOF else HF_HUB_CACHE_MOUNT="/models/gharunners/hf-hub-cache" - SQUASH_FILE="/data/gharunners/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + SQUASH_FILE="/data/gharunners/containers/$(sanitize_image_filename "$IMAGE").sqsh" # Convert pyxis image format (nvcr.io#path) to docker format (nvcr.io/path) for enroot import DOCKER_IMAGE=$(echo "$IMAGE" | sed 's/#/\//g') diff --git a/runners/launch_h200-nb.sh b/runners/launch_h200-nb.sh index 9d157a858..849f73699 100644 --- a/runners/launch_h200-nb.sh +++ b/runners/launch_h200-nb.sh @@ -1,5 +1,7 @@ #!/usr/bin/bash +source "$(dirname "$0")/../benchmarks/benchmark_lib.sh" + export HF_HUB_CACHE_MOUNT="/mnt/data/gharunners/hf-hub-cache/" export PORT=8888 @@ -12,7 +14,7 @@ PARTITION="main" set -x srun --partition=$PARTITION --gres=gpu:$TP --exclusive --job-name="$RUNNER_NAME" \ --container-image=$IMAGE \ ---container-name=$(echo "$IMAGE" | sed 's/[\/:@#]/_/g')-${USER} \ +--container-name=$(sanitize_image_filename "$IMAGE")-${USER} \ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --container-remap-root \ --container-writable \ diff --git a/runners/launch_mi300x-amds.sh b/runners/launch_mi300x-amds.sh index b654c515a..da98f3015 100644 --- a/runners/launch_mi300x-amds.sh +++ b/runners/launch_mi300x-amds.sh @@ -1,10 +1,12 @@ #!/usr/bin/env bash +source "$(dirname "$0")/../benchmarks/benchmark_lib.sh" + export HF_HUB_CACHE_MOUNT="/raid/hf-hub-cache/" export PORT=8888 PARTITION="compute" -SQUASH_FILE="/home/gharunner/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +SQUASH_FILE="/home/gharunner/gharunners/squash/$(sanitize_image_filename "$IMAGE").sqsh" LOCK_FILE="${SQUASH_FILE}.lock" set -x diff --git a/runners/launch_mi325x-amds.sh b/runners/launch_mi325x-amds.sh index 67f93a309..200b46838 100644 --- a/runners/launch_mi325x-amds.sh +++ b/runners/launch_mi325x-amds.sh @@ -1,10 +1,12 @@ #!/usr/bin/env bash +source "$(dirname "$0")/../benchmarks/benchmark_lib.sh" + export 
HF_HUB_CACHE_MOUNT="/nfsdata/sa/gharunner/gharunners/hf-hub-cache/"
 export PORT=8888

 PARTITION="compute"
-SQUASH_FILE="/nfsdata/sa/gharunner/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
+SQUASH_FILE="/nfsdata/sa/gharunner/gharunners/squash/$(sanitize_image_filename "$IMAGE").sqsh"
 LOCK_FILE="${SQUASH_FILE}.lock"

 set -x
diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh
index 152745d4e..a14cfdb2c 100644
--- a/runners/launch_mi355x-amds.sh
+++ b/runners/launch_mi355x-amds.sh
@@ -1,5 +1,7 @@
 #!/usr/bin/env bash

+source "$(dirname "$0")/../benchmarks/benchmark_lib.sh"
+
 scancel_sync() {
     local jobid=$1
     local timeout=${2:-600}
@@ -182,7 +184,7 @@ else
     SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')

     PARTITION="compute"
-    SQUASH_FILE="/var/lib/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
+    SQUASH_FILE="/var/lib/squash/$(sanitize_image_filename "$IMAGE").sqsh"
     LOCK_FILE="${SQUASH_FILE}.lock"

     set -x

From 77de8570d26edfbac0e4cdbcca75ee11be4211c4 Mon Sep 17 00:00:00 2001
From: Cam Quilici
Date: Tue, 28 Apr 2026 10:54:32 -0500
Subject: [PATCH 04/16] srt-slurm: reorganize recipes by
 model/framework/hw/seq-len/topology

Restructure benchmarks/multi_node/srt-slurm-recipes/ from the
upstream's heterogeneous layout into a uniform tree:

    <model>/<framework>/<hw>-<precision>/<isl>/<osl>/<topology>/<stp|mtp>/<recipe>.yaml

so a sweep contributor can navigate to
dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ and immediately see every recipe
that fits that cell. The 3 sglang multi-override files that span both
stp and mtp are parked one level shallower (no trailing stp|mtp/), since
the override section selects the spec mode.

365 files moved, 388 active + 5 commented recipe references rewritten,
schema validation + tests still green.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .github/configs/CONFIGS.md | 4 +-
 .github/configs/nvidia-master.yaml | 786 +++++++++---------
 .../sglang/b200-fp4/1k/1k/disagg}/1k1k.yaml | 0
 .../sglang/b200-fp4/8k/1k/disagg}/8k1k.yaml | 0
 .../sglang/b200-fp8/1k/1k/disagg}/1k1k.yaml | 0
 .../8k/1k/disagg/mtp}/8k1k_mtp_lowlat_0.yaml | 0
 .../8k/1k/disagg/mtp}/8k1k_mtp_lowlat_1.yaml | 0
 .../8k/1k/disagg/mtp}/8k1k_mtp_lowlat_2.yaml | 0
 .../8k/1k/disagg/mtp}/8k1k_mtp_maxtpt_0.yaml | 0
 .../8k/1k/disagg/mtp}/8k1k_mtp_maxtpt_1.yaml | 0
 .../8k/1k/disagg/mtp}/8k1k_mtp_maxtpt_2.yaml | 0
 .../8k/1k/disagg/mtp}/8k1k_mtp_maxtpt_3.yaml | 0
 .../8k/1k/disagg/stp}/8k1k_stp_lowlat_0.yaml | 0
 .../8k/1k/disagg/stp}/8k1k_stp_lowlat_1.yaml | 0
 .../8k/1k/disagg/stp}/8k1k_stp_lowlat_2.yaml | 0
 .../8k/1k/disagg/stp}/8k1k_stp_maxtpt_0.yaml | 0
 .../8k/1k/disagg/stp}/8k1k_stp_maxtpt_1.yaml | 0
 .../8k/1k/disagg/stp}/8k1k_stp_maxtpt_2.yaml | 0
 .../8k/1k/disagg/stp}/8k1k_stp_maxtpt_3.yaml | 0
 .../1k/1k/disagg/stp}/low-latency.yaml | 0
 .../gb200-fp4/1k/1k/disagg/stp}/max-tpt.yaml | 0
 .../1k/1k/disagg/stp}/mid-curve.yaml | 0
 .../8k/1k/disagg/stp}/low-latency.yaml | 0
 .../gb200-fp4/8k/1k/disagg/stp}/max-tpt.yaml | 0
 .../8k/1k/disagg/stp}/mid-curve.yaml | 0
 .../1k/1k/disagg/stp}/low-latency.yaml | 0
 .../gb200-fp8/1k/1k/disagg/stp}/max-tpt.yaml | 0
 .../1k/1k/disagg/stp}/mid-curve.yaml | 0
 .../1k/1k/disagg/stp}/ultra-tpt.yaml | 0
 .../8k/1k/disagg/stp}/low-latency.yaml | 0
 .../gb200-fp8/8k/1k/disagg/stp}/max_tpt.yaml | 0
 .../8k/1k/disagg/stp}/mid-curve.yaml | 0
 .../1k/1k/disagg/stp}/low_latency.yaml | 0
 .../gb300-fp4/1k/1k/disagg/stp}/max_tpt.yaml | 0
 .../1k/1k/disagg/stp}/mid_curve.yaml | 0
 .../8k/1k/disagg/stp}/low_latency.yaml | 0
 .../gb300-fp4/8k/1k/disagg/stp}/max_tpt.yaml | 0
 .../8k/1k/disagg/stp}/mid_curve.yaml | 0
.../1k/1k/disagg}/stp/low-latency.yaml | 0 .../gb300-fp8/1k/1k/disagg}/stp/max.yaml | 0 .../gb300-fp8/1k/1k/disagg}/stp/mid.yaml | 0 .../8k/1k/disagg}/stp/low-latency.yaml | 0 .../gb300-fp8/8k/1k/disagg}/stp/max.yaml | 0 .../gb300-fp8/8k/1k/disagg}/stp/mid.yaml | 0 .../mtp/h100-fp8-1p1d-max-dep-mtp.yaml | 0 .../disagg}/mtp/h100-fp8-1p2d-max-tp-mtp.yaml | 0 .../1k/disagg}/stp/h100-fp8-1p1d-max-dep.yaml | 0 .../1k/disagg}/stp/h100-fp8-1p2d-max-tp.yaml | 0 .../mtp/h100-fp8-1p1d-max-dep-mtp.yaml | 0 .../disagg}/mtp/h100-fp8-1p1d-max-tp-mtp.yaml | 0 .../1k/disagg}/stp/h100-fp8-1p1d-max-dep.yaml | 0 .../1k/disagg}/stp/h100-fp8-1p1d-max-tp.yaml | 0 .../1k/1k/disagg/mtp}/bs256-1p6d-dep-mtp.yaml | 0 .../1k/1k/disagg/mtp}/bs256-1p6d-tp-mtp.yaml | 0 .../1k/disagg/mtp}/low-latency-1p9d-mtp.yaml | 0 .../1k/1k/disagg/stp}/bs256-1p6d-dep.yaml | 0 .../1k/1k/disagg/stp}/bs256-1p6d-tp.yaml | 0 .../1k/1k/disagg/stp}/low-latency-1p9d.yaml | 0 .../8k/1k/disagg/mtp}/bs128-1p1d-dep-mtp.yaml | 0 .../8k/1k/disagg/mtp}/bs16-1p3d-mtp.yaml | 0 .../8k/1k/disagg/mtp}/bs4-1p7d-mtp.yaml | 0 .../8k/1k/disagg/mtp}/bs64-2p3d-mtp.yaml | 0 .../8k/1k/disagg/mtp}/bs8-1p6d-mtp.yaml | 0 .../8k/1k/disagg/stp}/bs128-1p1d-dep.yaml | 0 .../h200-fp8/8k/1k/disagg/stp}/bs16-1p3d.yaml | 0 .../h200-fp8/8k/1k/disagg/stp}/bs4-1p7d.yaml | 0 .../h200-fp8/8k/1k/disagg/stp}/bs64-2p3d.yaml | 0 .../h200-fp8/8k/1k/disagg/stp}/bs8-1p6d.yaml | 0 .../ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml | 0 .../ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml | 0 .../mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml | 0 .../ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml | 0 .../ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml | 0 .../ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml | 0 .../ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml | 0 .../ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml | 0 .../ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml | 0 .../stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml | 0 .../ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml | 0 .../ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml | 0 .../mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml | 0 .../ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml | 0 .../mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml | 0 .../mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml | 0 .../ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml | 0 .../ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml | 0 .../ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml | 0 .../stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml | 0 .../stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml | 0 .../ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml | 0 .../ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml | 0 .../ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml | 0 .../ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml | 0 ...x1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml | 0 ...x1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml | 0 ...x1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml | 0 ...tx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml | 0 .../ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml | 0 .../ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml | 0 .../ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml | 0 .../ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml | 0 ...x1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml | 0 ...tx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml | 0 ...ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml | 0 .../ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml | 0 ...tx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml | 0 ...x2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml | 0 .../ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml | 0 .../ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml | 0 .../ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml | 0 .../ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml | 0 ...ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml | 0 
.../ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml | 0 ...x4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml | 0 ...tx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml | 0 ...tx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml | 0 .../ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml | 0 ...ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml | 0 .../ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml | 0 .../ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml | 0 .../ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml | 0 ...tx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml | 0 .../ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml | 0 .../ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml | 0 .../mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml | 0 .../ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml | 0 .../ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml | 0 .../ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml | 0 .../ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml | 0 .../ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml | 0 .../stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml | 0 .../stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml | 0 .../ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml | 0 .../ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml | 0 .../ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml | 0 .../ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml | 0 .../ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml | 0 .../mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml | 0 .../mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml | 0 .../mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml | 0 .../ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml | 0 .../ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml | 0 .../ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml | 0 .../ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml | 0 .../stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml | 0 .../stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml | 0 .../ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml | 0 .../ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml | 0 .../ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml | 0 ...tx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml | 0 ...x1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml | 0 ...ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml | 0 .../ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml | 0 .../ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml | 0 ...x3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml | 0 ...x1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml | 0 ...tx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml | 0 ...ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml | 0 .../ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml | 0 .../ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml | 0 .../ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml | 0 ...2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml | 0 .../ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml | 0 .../ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml | 0 .../ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml | 0 .../ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml | 0 .../ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml | 0 .../ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml | 0 .../ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml | 0 .../ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml | 0 .../ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml | 0 .../ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml | 0 .../ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml | 0 ...tx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml | 0 ...x7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml | 0 .../ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml | 0 .../mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml | 0 ...ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml | 0 .../ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml | 0 .../ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml | 0 .../ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml | 0 .../ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml | 0 .../ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml | 0 .../stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml | 0 
.../ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml | 0 ...ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml | 0 .../ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml | 0 ...tx11_gen1_dep16_batch256_eplb256_mtp1.yaml | 0 .../mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml | 0 .../ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml | 0 .../ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml | 0 .../ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml | 0 ...tx10_gen1_dep16_batch256_eplb256_mtp0.yaml | 0 .../ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml | 0 .../stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml | 0 .../ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml | 0 .../ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml | 0 .../ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml | 0 ...x1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml | 0 ...tx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml | 0 ...x1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml | 0 ...x1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml | 0 .../ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml | 0 .../ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml | 0 .../ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml | 0 ...1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml | 0 ...x1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml | 0 ...ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml | 0 ...x1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml | 0 ...x1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml | 0 .../ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml | 0 .../ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml | 0 .../ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml | 0 .../ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml | 0 .../ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml | 0 ...tx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml | 0 ...ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml | 0 ...ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml | 0 ...tx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml | 0 .../ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml | 0 .../ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml | 0 .../ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml | 0 ...ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml | 0 ...tx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml | 0 ...tx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml | 0 ...x5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml | 0 .../mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml | 0 .../ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml | 0 .../mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml | 0 .../mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml | 0 ...ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml | 0 .../ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml | 0 .../stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml | 0 .../ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml | 0 .../ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml | 0 .../ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml | 0 ...ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml | 0 .../ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml | 0 .../ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml | 0 .../ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml | 0 ...ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml | 0 .../mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml | 0 .../mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml | 0 .../mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml | 0 .../ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml | 0 .../ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml | 0 .../ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml | 0 .../ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml | 0 .../ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml | 0 .../stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml | 0 .../stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml | 0 .../stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml | 0 .../ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml | 0 .../ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml | 0 ...tx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml | 0 ...ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml | 0 
.../ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml | 0 .../ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml | 0 ...2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml | 0 ...tx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml | 0 ...x3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml | 0 .../ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml | 0 .../ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml | 0 .../ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml | 0 ...2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml | 0 ...x2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml | 0 ...x3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml | 0 ...3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml | 0 ...10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml | 0 .../ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml | 0 .../ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml | 0 ...ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml | 0 ...x7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml | 0 ...tx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml | 0 .../ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml | 0 .../ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml | 0 .../ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml | 0 ...tx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml | 0 ...tx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml | 0 ...x7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml | 0 ...x7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml | 0 .../ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml | 0 .../ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml | 0 .../ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml | 0 .../ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml | 0 .../ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml | 0 .../ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml | 0 .../ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml | 0 .../ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml | 0 .../ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml | 0 .../ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml | 0 .../ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml | 0 .../ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml | 0 .../ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml | 0 .../ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml | 0 .../ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml | 0 .../ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml | 0 .../ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml | 0 .../ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml | 0 .../ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml | 0 .../ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml | 0 .../ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml | 0 .../ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml | 0 .../ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml | 0 .../ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml | 0 .../ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml | 0 .../ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml | 0 .../ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml | 0 .../ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml | 0 .../ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml | 0 ...28_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml | 0 ...16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml | 0 .../c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml | 0 ...56_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml | 0 ...2_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml | 0 ...4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml | 0 ...12_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml | 0 ...64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml | 0 ...8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml | 0 ...28_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml | 0 ...16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml | 0 .../c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml | 0 ...56_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml | 0 ...32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml | 0 ...c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml | 0 ...12_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml | 0 ...64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml | 0 ...c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml | 0 
...128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml | 0 ...c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml | 0 .../c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml | 0 ...256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml | 0 ...c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml | 0 .../c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml | 0 ...512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml | 0 ...c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml | 0 .../c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml | 0 ...28_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml | 0 ...c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml | 0 .../c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml | 0 ...56_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml | 0 ...32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml | 0 .../c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml | 0 ...12_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml | 0 ...64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml | 0 .../c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml | 0 .../stp}/disagg-gb200-1p1d-dep8-dep16.yaml | 0 .../stp}/disagg-gb200-1p1d-dep8-tep8.yaml | 0 .../stp}/disagg-gb200-3p1d-dep8-dep16.yaml | 0 .../stp}/disagg-gb200-1p1d-dep8-tep8.yaml | 0 .../stp}/disagg-gb200-3p1d-dep8-dep16.yaml | 0 .../stp}/disagg-gb200-7p1d-dep8-dep16.yaml | 0 ...ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml | 0 ...ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml | 0 ..._gen1dep8_batch768_allconc_eplb0_mtp0.yaml | 0 ..._gen4tep8_batch128_allconc_eplb0_mtp0.yaml | 0 ...p4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml | 0 ...tx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml | 0 ...tx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml | 0 ...4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml | 0 ...p4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml | 0 ...4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml | 0 ...ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml | 0 ...ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml | 0 ..._gen1dep8_batch256_allconc_eplb0_mtp0.yaml | 0 ...tx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml | 0 .../stp}/disagg-gb200-1p1d-dep4-dep16.yaml | 0 .../stp}/disagg-gb200-1p4d-dep4-tep4.yaml | 0 .../stp}/disagg-gb200-1p4d-dep4-tep4.yaml | 0 .../stp}/disagg-gb200-3p1d-dep4-dep16.yaml | 0 .../stp}/disagg-gb200-5p1d-dep4-dep8.yaml | 0 .../stp}/disagg-gb200-6p1d-dep4-dep16.yaml | 0 372 files changed, 395 insertions(+), 395 deletions(-) rename benchmarks/multi_node/srt-slurm-recipes/{b200-fp4 => dsr1/sglang/b200-fp4/1k/1k/disagg}/1k1k.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{b200-fp4 => dsr1/sglang/b200-fp4/8k/1k/disagg}/8k1k.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{b200-fp8 => dsr1/sglang/b200-fp8/1k/1k/disagg}/1k1k.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{b200-fp8 => dsr1/sglang/b200-fp8/8k/1k/disagg/mtp}/8k1k_mtp_lowlat_0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{b200-fp8 => dsr1/sglang/b200-fp8/8k/1k/disagg/mtp}/8k1k_mtp_lowlat_1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{b200-fp8 => dsr1/sglang/b200-fp8/8k/1k/disagg/mtp}/8k1k_mtp_lowlat_2.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{b200-fp8 => dsr1/sglang/b200-fp8/8k/1k/disagg/mtp}/8k1k_mtp_maxtpt_0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{b200-fp8 => dsr1/sglang/b200-fp8/8k/1k/disagg/mtp}/8k1k_mtp_maxtpt_1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{b200-fp8 => dsr1/sglang/b200-fp8/8k/1k/disagg/mtp}/8k1k_mtp_maxtpt_2.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{b200-fp8 => dsr1/sglang/b200-fp8/8k/1k/disagg/mtp}/8k1k_mtp_maxtpt_3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{b200-fp8 => 
dsr1/sglang/b200-fp8/8k/1k/disagg/stp}/8k1k_stp_lowlat_0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{b200-fp8 => dsr1/sglang/b200-fp8/8k/1k/disagg/stp}/8k1k_stp_lowlat_1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{b200-fp8 => dsr1/sglang/b200-fp8/8k/1k/disagg/stp}/8k1k_stp_lowlat_2.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{b200-fp8 => dsr1/sglang/b200-fp8/8k/1k/disagg/stp}/8k1k_stp_maxtpt_0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{b200-fp8 => dsr1/sglang/b200-fp8/8k/1k/disagg/stp}/8k1k_stp_maxtpt_1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{b200-fp8 => dsr1/sglang/b200-fp8/8k/1k/disagg/stp}/8k1k_stp_maxtpt_2.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{b200-fp8 => dsr1/sglang/b200-fp8/8k/1k/disagg/stp}/8k1k_stp_maxtpt_3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb200-fp4/1k1k => dsr1/sglang/gb200-fp4/1k/1k/disagg/stp}/low-latency.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb200-fp4/1k1k => dsr1/sglang/gb200-fp4/1k/1k/disagg/stp}/max-tpt.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb200-fp4/1k1k => dsr1/sglang/gb200-fp4/1k/1k/disagg/stp}/mid-curve.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb200-fp4/8k1k => dsr1/sglang/gb200-fp4/8k/1k/disagg/stp}/low-latency.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb200-fp4/8k1k => dsr1/sglang/gb200-fp4/8k/1k/disagg/stp}/max-tpt.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb200-fp4/8k1k => dsr1/sglang/gb200-fp4/8k/1k/disagg/stp}/mid-curve.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb200-fp8/1k1k => dsr1/sglang/gb200-fp8/1k/1k/disagg/stp}/low-latency.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb200-fp8/1k1k => dsr1/sglang/gb200-fp8/1k/1k/disagg/stp}/max-tpt.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb200-fp8/1k1k => dsr1/sglang/gb200-fp8/1k/1k/disagg/stp}/mid-curve.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb200-fp8/1k1k => dsr1/sglang/gb200-fp8/1k/1k/disagg/stp}/ultra-tpt.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb200-fp8/8k1k => dsr1/sglang/gb200-fp8/8k/1k/disagg/stp}/low-latency.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb200-fp8/8k1k => dsr1/sglang/gb200-fp8/8k/1k/disagg/stp}/max_tpt.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb200-fp8/8k1k => dsr1/sglang/gb200-fp8/8k/1k/disagg/stp}/mid-curve.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb300-fp4/1k1k => dsr1/sglang/gb300-fp4/1k/1k/disagg/stp}/low_latency.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb300-fp4/1k1k => dsr1/sglang/gb300-fp4/1k/1k/disagg/stp}/max_tpt.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb300-fp4/1k1k => dsr1/sglang/gb300-fp4/1k/1k/disagg/stp}/mid_curve.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb300-fp4/8k1k => dsr1/sglang/gb300-fp4/8k/1k/disagg/stp}/low_latency.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb300-fp4/8k1k => dsr1/sglang/gb300-fp4/8k/1k/disagg/stp}/max_tpt.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb300-fp4/8k1k => dsr1/sglang/gb300-fp4/8k/1k/disagg/stp}/mid_curve.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb300-fp8/1k1k => dsr1/sglang/gb300-fp8/1k/1k/disagg}/stp/low-latency.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb300-fp8/1k1k => dsr1/sglang/gb300-fp8/1k/1k/disagg}/stp/max.yaml (100%) rename 
benchmarks/multi_node/srt-slurm-recipes/{gb300-fp8/1k1k => dsr1/sglang/gb300-fp8/1k/1k/disagg}/stp/mid.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb300-fp8/8k1k => dsr1/sglang/gb300-fp8/8k/1k/disagg}/stp/low-latency.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb300-fp8/8k1k => dsr1/sglang/gb300-fp8/8k/1k/disagg}/stp/max.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb300-fp8/8k1k => dsr1/sglang/gb300-fp8/8k/1k/disagg}/stp/mid.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h100/1k1k => dsr1/sglang/h100-fp8/1k/1k/disagg}/mtp/h100-fp8-1p1d-max-dep-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h100/1k1k => dsr1/sglang/h100-fp8/1k/1k/disagg}/mtp/h100-fp8-1p2d-max-tp-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h100/1k1k => dsr1/sglang/h100-fp8/1k/1k/disagg}/stp/h100-fp8-1p1d-max-dep.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h100/1k1k => dsr1/sglang/h100-fp8/1k/1k/disagg}/stp/h100-fp8-1p2d-max-tp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h100/8k1k => dsr1/sglang/h100-fp8/8k/1k/disagg}/mtp/h100-fp8-1p1d-max-dep-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h100/8k1k => dsr1/sglang/h100-fp8/8k/1k/disagg}/mtp/h100-fp8-1p1d-max-tp-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h100/8k1k => dsr1/sglang/h100-fp8/8k/1k/disagg}/stp/h100-fp8-1p1d-max-dep.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h100/8k1k => dsr1/sglang/h100-fp8/8k/1k/disagg}/stp/h100-fp8-1p1d-max-tp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h200/1k1k => dsr1/sglang/h200-fp8/1k/1k/disagg/mtp}/bs256-1p6d-dep-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h200/1k1k => dsr1/sglang/h200-fp8/1k/1k/disagg/mtp}/bs256-1p6d-tp-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h200/1k1k => dsr1/sglang/h200-fp8/1k/1k/disagg/mtp}/low-latency-1p9d-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h200/1k1k => dsr1/sglang/h200-fp8/1k/1k/disagg/stp}/bs256-1p6d-dep.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h200/1k1k => dsr1/sglang/h200-fp8/1k/1k/disagg/stp}/bs256-1p6d-tp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h200/1k1k => dsr1/sglang/h200-fp8/1k/1k/disagg/stp}/low-latency-1p9d.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h200/8k1k => dsr1/sglang/h200-fp8/8k/1k/disagg/mtp}/bs128-1p1d-dep-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h200/8k1k => dsr1/sglang/h200-fp8/8k/1k/disagg/mtp}/bs16-1p3d-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h200/8k1k => dsr1/sglang/h200-fp8/8k/1k/disagg/mtp}/bs4-1p7d-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h200/8k1k => dsr1/sglang/h200-fp8/8k/1k/disagg/mtp}/bs64-2p3d-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h200/8k1k => dsr1/sglang/h200-fp8/8k/1k/disagg/mtp}/bs8-1p6d-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h200/8k1k => dsr1/sglang/h200-fp8/8k/1k/disagg/stp}/bs128-1p1d-dep.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h200/8k1k => dsr1/sglang/h200-fp8/8k/1k/disagg/stp}/bs16-1p3d.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h200/8k1k => dsr1/sglang/h200-fp8/8k/1k/disagg/stp}/bs4-1p7d.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h200/8k1k => dsr1/sglang/h200-fp8/8k/1k/disagg/stp}/bs64-2p3d.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h200/8k1k => 
dsr1/sglang/h200-fp8/8k/1k/disagg/stp}/bs8-1p6d.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/1k1k => dsr1/trtllm/b200-fp4/1k/1k/disagg}/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/1k1k => dsr1/trtllm/b200-fp4/1k/1k/disagg}/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/1k1k => dsr1/trtllm/b200-fp4/1k/1k/disagg}/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/1k1k => dsr1/trtllm/b200-fp4/1k/1k/disagg}/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/1k1k => dsr1/trtllm/b200-fp4/1k/1k/disagg}/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/1k1k => dsr1/trtllm/b200-fp4/1k/1k/disagg}/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/1k1k => dsr1/trtllm/b200-fp4/1k/1k/disagg}/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/1k1k => dsr1/trtllm/b200-fp4/1k/1k/disagg}/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/1k1k => dsr1/trtllm/b200-fp4/1k/1k/disagg}/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/1k1k => dsr1/trtllm/b200-fp4/1k/1k/disagg}/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/1k1k => dsr1/trtllm/b200-fp4/1k/1k/disagg}/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/1k1k => dsr1/trtllm/b200-fp4/1k/1k/disagg}/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/8k1k => dsr1/trtllm/b200-fp4/8k/1k/disagg}/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/8k1k => dsr1/trtllm/b200-fp4/8k/1k/disagg}/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/8k1k => dsr1/trtllm/b200-fp4/8k/1k/disagg}/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/8k1k => dsr1/trtllm/b200-fp4/8k/1k/disagg}/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/8k1k => dsr1/trtllm/b200-fp4/8k/1k/disagg}/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/8k1k => dsr1/trtllm/b200-fp4/8k/1k/disagg}/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/8k1k => dsr1/trtllm/b200-fp4/8k/1k/disagg}/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/8k1k => dsr1/trtllm/b200-fp4/8k/1k/disagg}/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/8k1k => dsr1/trtllm/b200-fp4/8k/1k/disagg}/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/8k1k => dsr1/trtllm/b200-fp4/8k/1k/disagg}/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml (100%) rename 
benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/8k1k => dsr1/trtllm/b200-fp4/8k/1k/disagg}/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/8k1k => dsr1/trtllm/b200-fp4/8k/1k/disagg}/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/8k1k => dsr1/trtllm/b200-fp4/8k/1k/disagg}/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/1k1k => dsr1/trtllm/b200-fp8/1k/1k/disagg}/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/1k1k => dsr1/trtllm/b200-fp8/1k/1k/disagg}/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/1k1k => dsr1/trtllm/b200-fp8/1k/1k/disagg}/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/1k1k => dsr1/trtllm/b200-fp8/1k/1k/disagg}/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/1k1k => dsr1/trtllm/b200-fp8/1k/1k/disagg}/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/1k1k => dsr1/trtllm/b200-fp8/1k/1k/disagg}/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/1k1k => dsr1/trtllm/b200-fp8/1k/1k/disagg}/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/1k1k => dsr1/trtllm/b200-fp8/1k/1k/disagg}/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/1k1k => dsr1/trtllm/b200-fp8/1k/1k/disagg}/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/1k1k => dsr1/trtllm/b200-fp8/1k/1k/disagg}/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/1k1k => dsr1/trtllm/b200-fp8/1k/1k/disagg}/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/1k1k => dsr1/trtllm/b200-fp8/1k/1k/disagg}/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/1k1k => dsr1/trtllm/b200-fp8/1k/1k/disagg}/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/1k1k => dsr1/trtllm/b200-fp8/1k/1k/disagg}/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/8k1k => dsr1/trtllm/b200-fp8/8k/1k/disagg}/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/8k1k => dsr1/trtllm/b200-fp8/8k/1k/disagg}/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/8k1k => dsr1/trtllm/b200-fp8/8k/1k/disagg}/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/8k1k => dsr1/trtllm/b200-fp8/8k/1k/disagg}/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/8k1k => dsr1/trtllm/b200-fp8/8k/1k/disagg}/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml (100%) rename 
benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/8k1k => dsr1/trtllm/b200-fp8/8k/1k/disagg}/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/8k1k => dsr1/trtllm/b200-fp8/8k/1k/disagg}/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/8k1k => dsr1/trtllm/b200-fp8/8k/1k/disagg}/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/8k1k => dsr1/trtllm/b200-fp8/8k/1k/disagg}/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/8k1k => dsr1/trtllm/b200-fp8/8k/1k/disagg}/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/8k1k => dsr1/trtllm/b200-fp8/8k/1k/disagg}/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/8k1k => dsr1/trtllm/b200-fp8/8k/1k/disagg}/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/8k1k => dsr1/trtllm/b200-fp8/8k/1k/disagg}/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/8k1k => dsr1/trtllm/b200-fp8/8k/1k/disagg}/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/8k1k => dsr1/trtllm/b200-fp8/8k/1k/disagg}/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/1k1k => dsr1/trtllm/b300-fp4/1k/1k/disagg}/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/1k1k => dsr1/trtllm/b300-fp4/1k/1k/disagg}/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/1k1k => dsr1/trtllm/b300-fp4/1k/1k/disagg}/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/1k1k => dsr1/trtllm/b300-fp4/1k/1k/disagg}/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/1k1k => dsr1/trtllm/b300-fp4/1k/1k/disagg}/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/1k1k => dsr1/trtllm/b300-fp4/1k/1k/disagg}/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/1k1k => dsr1/trtllm/b300-fp4/1k/1k/disagg}/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/1k1k => dsr1/trtllm/b300-fp4/1k/1k/disagg}/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/1k1k => dsr1/trtllm/b300-fp4/1k/1k/disagg}/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/1k1k => dsr1/trtllm/b300-fp4/1k/1k/disagg}/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/1k1k => dsr1/trtllm/b300-fp4/1k/1k/disagg}/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/1k1k => dsr1/trtllm/b300-fp4/1k/1k/disagg}/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/1k1k => 
dsr1/trtllm/b300-fp4/1k/1k/disagg}/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/1k1k => dsr1/trtllm/b300-fp4/1k/1k/disagg}/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/8k1k => dsr1/trtllm/b300-fp4/8k/1k/disagg}/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/8k1k => dsr1/trtllm/b300-fp4/8k/1k/disagg}/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/8k1k => dsr1/trtllm/b300-fp4/8k/1k/disagg}/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/8k1k => dsr1/trtllm/b300-fp4/8k/1k/disagg}/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/8k1k => dsr1/trtllm/b300-fp4/8k/1k/disagg}/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/8k1k => dsr1/trtllm/b300-fp4/8k/1k/disagg}/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/8k1k => dsr1/trtllm/b300-fp4/8k/1k/disagg}/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/8k1k => dsr1/trtllm/b300-fp4/8k/1k/disagg}/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/8k1k => dsr1/trtllm/b300-fp4/8k/1k/disagg}/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/8k1k => dsr1/trtllm/b300-fp4/8k/1k/disagg}/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/8k1k => dsr1/trtllm/b300-fp4/8k/1k/disagg}/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/8k1k => dsr1/trtllm/b300-fp4/8k/1k/disagg}/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/8k1k => dsr1/trtllm/b300-fp4/8k/1k/disagg}/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/1k1k => dsr1/trtllm/b300-fp8/1k/1k/disagg}/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/1k1k => dsr1/trtllm/b300-fp8/1k/1k/disagg}/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/1k1k => dsr1/trtllm/b300-fp8/1k/1k/disagg}/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/1k1k => dsr1/trtllm/b300-fp8/1k/1k/disagg}/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/1k1k => dsr1/trtllm/b300-fp8/1k/1k/disagg}/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/1k1k => dsr1/trtllm/b300-fp8/1k/1k/disagg}/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/1k1k => dsr1/trtllm/b300-fp8/1k/1k/disagg}/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/1k1k => 
dsr1/trtllm/b300-fp8/1k/1k/disagg}/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/1k1k => dsr1/trtllm/b300-fp8/1k/1k/disagg}/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/1k1k => dsr1/trtllm/b300-fp8/1k/1k/disagg}/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/1k1k => dsr1/trtllm/b300-fp8/1k/1k/disagg}/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/1k1k => dsr1/trtllm/b300-fp8/1k/1k/disagg}/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/1k1k => dsr1/trtllm/b300-fp8/1k/1k/disagg}/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/8k1k => dsr1/trtllm/b300-fp8/8k/1k/disagg}/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/8k1k => dsr1/trtllm/b300-fp8/8k/1k/disagg}/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/8k1k => dsr1/trtllm/b300-fp8/8k/1k/disagg}/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/8k1k => dsr1/trtllm/b300-fp8/8k/1k/disagg}/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/8k1k => dsr1/trtllm/b300-fp8/8k/1k/disagg}/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/8k1k => dsr1/trtllm/b300-fp8/8k/1k/disagg}/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/8k1k => dsr1/trtllm/b300-fp8/8k/1k/disagg}/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/8k1k => dsr1/trtllm/b300-fp8/8k/1k/disagg}/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/8k1k => dsr1/trtllm/b300-fp8/8k/1k/disagg}/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/8k1k => dsr1/trtllm/b300-fp8/8k/1k/disagg}/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/8k1k => dsr1/trtllm/b300-fp8/8k/1k/disagg}/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/8k1k => dsr1/trtllm/b300-fp8/8k/1k/disagg}/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/8k1k => dsr1/trtllm/b300-fp8/8k/1k/disagg}/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/1k1k => dsr1/trtllm/gb200-fp4/1k/1k/disagg}/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/1k1k => dsr1/trtllm/gb200-fp4/1k/1k/disagg}/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/1k1k => dsr1/trtllm/gb200-fp4/1k/1k/disagg}/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/1k1k => 
dsr1/trtllm/gb200-fp4/1k/1k/disagg}/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/1k1k => dsr1/trtllm/gb200-fp4/1k/1k/disagg}/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/1k1k => dsr1/trtllm/gb200-fp4/1k/1k/disagg}/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/1k1k => dsr1/trtllm/gb200-fp4/1k/1k/disagg}/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/1k1k => dsr1/trtllm/gb200-fp4/1k/1k/disagg}/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/1k1k => dsr1/trtllm/gb200-fp4/1k/1k/disagg}/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/1k1k => dsr1/trtllm/gb200-fp4/1k/1k/disagg}/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/1k1k => dsr1/trtllm/gb200-fp4/1k/1k/disagg}/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/1k1k => dsr1/trtllm/gb200-fp4/1k/1k/disagg}/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/8k1k => dsr1/trtllm/gb200-fp4/8k/1k/disagg}/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/8k1k => dsr1/trtllm/gb200-fp4/8k/1k/disagg}/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/8k1k => dsr1/trtllm/gb200-fp4/8k/1k/disagg}/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/8k1k => dsr1/trtllm/gb200-fp4/8k/1k/disagg}/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/8k1k => dsr1/trtllm/gb200-fp4/8k/1k/disagg}/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/8k1k => dsr1/trtllm/gb200-fp4/8k/1k/disagg}/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/8k1k => dsr1/trtllm/gb200-fp4/8k/1k/disagg}/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/8k1k => dsr1/trtllm/gb200-fp4/8k/1k/disagg}/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/8k1k => dsr1/trtllm/gb200-fp4/8k/1k/disagg}/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/8k1k => dsr1/trtllm/gb200-fp4/8k/1k/disagg}/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/8k1k => dsr1/trtllm/gb200-fp4/8k/1k/disagg}/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/1k1k => dsr1/trtllm/gb200-fp8/1k/1k/disagg}/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/1k1k => dsr1/trtllm/gb200-fp8/1k/1k/disagg}/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/1k1k => 
dsr1/trtllm/gb200-fp8/1k/1k/disagg}/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/1k1k => dsr1/trtllm/gb200-fp8/1k/1k/disagg}/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/1k1k => dsr1/trtllm/gb200-fp8/1k/1k/disagg}/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/1k1k => dsr1/trtllm/gb200-fp8/1k/1k/disagg}/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/1k1k => dsr1/trtllm/gb200-fp8/1k/1k/disagg}/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/1k1k => dsr1/trtllm/gb200-fp8/1k/1k/disagg}/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/1k1k => dsr1/trtllm/gb200-fp8/1k/1k/disagg}/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/1k1k => dsr1/trtllm/gb200-fp8/1k/1k/disagg}/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/1k1k => dsr1/trtllm/gb200-fp8/1k/1k/disagg}/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/1k1k => dsr1/trtllm/gb200-fp8/1k/1k/disagg}/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/1k1k => dsr1/trtllm/gb200-fp8/1k/1k/disagg}/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/1k1k => dsr1/trtllm/gb200-fp8/1k/1k/disagg}/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/8k1k => dsr1/trtllm/gb200-fp8/8k/1k/disagg}/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/8k1k => dsr1/trtllm/gb200-fp8/8k/1k/disagg}/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/8k1k => dsr1/trtllm/gb200-fp8/8k/1k/disagg}/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/8k1k => dsr1/trtllm/gb200-fp8/8k/1k/disagg}/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/8k1k => dsr1/trtllm/gb200-fp8/8k/1k/disagg}/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/8k1k => dsr1/trtllm/gb200-fp8/8k/1k/disagg}/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/8k1k => dsr1/trtllm/gb200-fp8/8k/1k/disagg}/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/8k1k => dsr1/trtllm/gb200-fp8/8k/1k/disagg}/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/8k1k => dsr1/trtllm/gb200-fp8/8k/1k/disagg}/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/8k1k => dsr1/trtllm/gb200-fp8/8k/1k/disagg}/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml (100%) rename 
benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/8k1k => dsr1/trtllm/gb200-fp8/8k/1k/disagg}/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/8k1k => dsr1/trtllm/gb200-fp8/8k/1k/disagg}/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/8k1k => dsr1/trtllm/gb200-fp8/8k/1k/disagg}/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/8k1k => dsr1/trtllm/gb200-fp8/8k/1k/disagg}/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/1k1k => dsr1/trtllm/gb300-fp4/1k/1k/disagg}/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/1k1k => dsr1/trtllm/gb300-fp4/1k/1k/disagg}/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/1k1k => dsr1/trtllm/gb300-fp4/1k/1k/disagg}/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/1k1k => dsr1/trtllm/gb300-fp4/1k/1k/disagg}/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/1k1k => dsr1/trtllm/gb300-fp4/1k/1k/disagg}/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/1k1k => dsr1/trtllm/gb300-fp4/1k/1k/disagg}/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/1k1k => dsr1/trtllm/gb300-fp4/1k/1k/disagg}/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/1k1k => dsr1/trtllm/gb300-fp4/1k/1k/disagg}/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/1k1k => dsr1/trtllm/gb300-fp4/1k/1k/disagg}/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/1k1k => dsr1/trtllm/gb300-fp4/1k/1k/disagg}/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/1k1k => dsr1/trtllm/gb300-fp4/1k/1k/disagg}/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/1k1k => dsr1/trtllm/gb300-fp4/1k/1k/disagg}/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/8k1k => dsr1/trtllm/gb300-fp4/8k/1k/disagg}/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/8k1k => dsr1/trtllm/gb300-fp4/8k/1k/disagg}/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/8k1k => dsr1/trtllm/gb300-fp4/8k/1k/disagg}/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/8k1k => dsr1/trtllm/gb300-fp4/8k/1k/disagg}/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/8k1k => dsr1/trtllm/gb300-fp4/8k/1k/disagg}/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/8k1k => dsr1/trtllm/gb300-fp4/8k/1k/disagg}/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml (100%) rename 
benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/8k1k => dsr1/trtllm/gb300-fp4/8k/1k/disagg}/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/8k1k => dsr1/trtllm/gb300-fp4/8k/1k/disagg}/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/8k1k => dsr1/trtllm/gb300-fp4/8k/1k/disagg}/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/8k1k => dsr1/trtllm/gb300-fp4/8k/1k/disagg}/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/8k1k => dsr1/trtllm/gb300-fp4/8k/1k/disagg}/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/8k1k => dsr1/trtllm/gb300-fp4/8k/1k/disagg}/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/8k1k => dsr1/trtllm/gb300-fp4/8k/1k/disagg}/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/8k1k => dsr1/trtllm/gb300-fp4/8k/1k/disagg}/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/8k1k => dsr1/trtllm/gb300-fp4/8k/1k/disagg}/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/8k1k => dsr1/trtllm/gb300-fp4/8k/1k/disagg}/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/1k1k => dsr1/trtllm/gb300-fp8/1k/1k/disagg}/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/1k1k => dsr1/trtllm/gb300-fp8/1k/1k/disagg}/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/1k1k => dsr1/trtllm/gb300-fp8/1k/1k/disagg}/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/1k1k => dsr1/trtllm/gb300-fp8/1k/1k/disagg}/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/1k1k => dsr1/trtllm/gb300-fp8/1k/1k/disagg}/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/1k1k => dsr1/trtllm/gb300-fp8/1k/1k/disagg}/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/1k1k => dsr1/trtllm/gb300-fp8/1k/1k/disagg}/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/1k1k => dsr1/trtllm/gb300-fp8/1k/1k/disagg}/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/1k1k => dsr1/trtllm/gb300-fp8/1k/1k/disagg}/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/1k1k => dsr1/trtllm/gb300-fp8/1k/1k/disagg}/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/1k1k => dsr1/trtllm/gb300-fp8/1k/1k/disagg}/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/1k1k => dsr1/trtllm/gb300-fp8/1k/1k/disagg}/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml (100%) rename 
benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/1k1k => dsr1/trtllm/gb300-fp8/1k/1k/disagg}/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/1k1k => dsr1/trtllm/gb300-fp8/1k/1k/disagg}/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/8k1k => dsr1/trtllm/gb300-fp8/8k/1k/disagg}/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/8k1k => dsr1/trtllm/gb300-fp8/8k/1k/disagg}/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/8k1k => dsr1/trtllm/gb300-fp8/8k/1k/disagg}/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/8k1k => dsr1/trtllm/gb300-fp8/8k/1k/disagg}/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/8k1k => dsr1/trtllm/gb300-fp8/8k/1k/disagg}/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/8k1k => dsr1/trtllm/gb300-fp8/8k/1k/disagg}/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/8k1k => dsr1/trtllm/gb300-fp8/8k/1k/disagg}/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/8k1k => dsr1/trtllm/gb300-fp8/8k/1k/disagg}/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/8k1k => dsr1/trtllm/gb300-fp8/8k/1k/disagg}/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/8k1k => dsr1/trtllm/gb300-fp8/8k/1k/disagg}/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/8k1k => dsr1/trtllm/gb300-fp8/8k/1k/disagg}/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/8k1k => dsr1/trtllm/gb300-fp8/8k/1k/disagg}/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/8k1k => dsr1/trtllm/gb300-fp8/8k/1k/disagg}/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/1k1k => dsr1/trtllm/h100-fp8/1k/1k/disagg}/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/1k1k => dsr1/trtllm/h100-fp8/1k/1k/disagg}/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/1k1k => dsr1/trtllm/h100-fp8/1k/1k/disagg}/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/1k1k => dsr1/trtllm/h100-fp8/1k/1k/disagg}/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/1k1k => dsr1/trtllm/h100-fp8/1k/1k/disagg}/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/1k1k => dsr1/trtllm/h100-fp8/1k/1k/disagg}/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/1k1k => dsr1/trtllm/h100-fp8/1k/1k/disagg}/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml (100%) rename 
benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/1k1k => dsr1/trtllm/h100-fp8/1k/1k/disagg}/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/1k1k => dsr1/trtllm/h100-fp8/1k/1k/disagg}/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/1k1k => dsr1/trtllm/h100-fp8/1k/1k/disagg}/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/1k1k => dsr1/trtllm/h100-fp8/1k/1k/disagg}/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/1k1k => dsr1/trtllm/h100-fp8/1k/1k/disagg}/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/1k1k => dsr1/trtllm/h100-fp8/1k/1k/disagg}/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/1k1k => dsr1/trtllm/h100-fp8/1k/1k/disagg}/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/1k1k => dsr1/trtllm/h100-fp8/1k/1k/disagg}/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/1k1k => dsr1/trtllm/h100-fp8/1k/1k/disagg}/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/1k1k => dsr1/trtllm/h100-fp8/1k/1k/disagg}/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/1k1k => dsr1/trtllm/h100-fp8/1k/1k/disagg}/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/8k1k => dsr1/trtllm/h100-fp8/8k/1k/disagg}/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/8k1k => dsr1/trtllm/h100-fp8/8k/1k/disagg}/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/8k1k => dsr1/trtllm/h100-fp8/8k/1k/disagg}/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/8k1k => dsr1/trtllm/h100-fp8/8k/1k/disagg}/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/8k1k => dsr1/trtllm/h100-fp8/8k/1k/disagg}/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/8k1k => dsr1/trtllm/h100-fp8/8k/1k/disagg}/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/8k1k => dsr1/trtllm/h100-fp8/8k/1k/disagg}/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/8k1k => dsr1/trtllm/h100-fp8/8k/1k/disagg}/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/8k1k => dsr1/trtllm/h100-fp8/8k/1k/disagg}/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/8k1k => dsr1/trtllm/h100-fp8/8k/1k/disagg}/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/8k1k => dsr1/trtllm/h100-fp8/8k/1k/disagg}/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/1k1k => 
dsr1/trtllm/h200-fp8/1k/1k/disagg}/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/1k1k => dsr1/trtllm/h200-fp8/1k/1k/disagg}/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/1k1k => dsr1/trtllm/h200-fp8/1k/1k/disagg}/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/1k1k => dsr1/trtllm/h200-fp8/1k/1k/disagg}/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/1k1k => dsr1/trtllm/h200-fp8/1k/1k/disagg}/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/1k1k => dsr1/trtllm/h200-fp8/1k/1k/disagg}/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/1k1k => dsr1/trtllm/h200-fp8/1k/1k/disagg}/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/1k1k => dsr1/trtllm/h200-fp8/1k/1k/disagg}/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/1k1k => dsr1/trtllm/h200-fp8/1k/1k/disagg}/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/1k1k => dsr1/trtllm/h200-fp8/1k/1k/disagg}/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/1k1k => dsr1/trtllm/h200-fp8/1k/1k/disagg}/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/1k1k => dsr1/trtllm/h200-fp8/1k/1k/disagg}/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/1k1k => dsr1/trtllm/h200-fp8/1k/1k/disagg}/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/1k1k => dsr1/trtllm/h200-fp8/1k/1k/disagg}/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/1k1k => dsr1/trtllm/h200-fp8/1k/1k/disagg}/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/1k1k => dsr1/trtllm/h200-fp8/1k/1k/disagg}/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/1k1k => dsr1/trtllm/h200-fp8/1k/1k/disagg}/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/1k1k => dsr1/trtllm/h200-fp8/1k/1k/disagg}/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/8k1k => dsr1/trtllm/h200-fp8/8k/1k/disagg}/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/8k1k => dsr1/trtllm/h200-fp8/8k/1k/disagg}/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/8k1k => dsr1/trtllm/h200-fp8/8k/1k/disagg}/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/8k1k => dsr1/trtllm/h200-fp8/8k/1k/disagg}/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/8k1k => dsr1/trtllm/h200-fp8/8k/1k/disagg}/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml 
(100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/8k1k => dsr1/trtllm/h200-fp8/8k/1k/disagg}/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/8k1k => dsr1/trtllm/h200-fp8/8k/1k/disagg}/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/8k1k => dsr1/trtllm/h200-fp8/8k/1k/disagg}/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/8k1k => dsr1/trtllm/h200-fp8/8k/1k/disagg}/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/8k1k => dsr1/trtllm/h200-fp8/8k/1k/disagg}/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/8k1k => dsr1/trtllm/h200-fp8/8k/1k/disagg}/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/8k1k => dsr1/trtllm/h200-fp8/8k/1k/disagg}/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/8k1k => dsr1/trtllm/h200-fp8/8k/1k/disagg}/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/8k1k => dsr1/trtllm/h200-fp8/8k/1k/disagg}/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/8k1k => dsr1/trtllm/h200-fp8/8k/1k/disagg}/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/8k1k => dsr1/trtllm/h200-fp8/8k/1k/disagg}/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/8k1k => dsr1/trtllm/h200-fp8/8k/1k/disagg}/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/8k1k => dsr1/trtllm/h200-fp8/8k/1k/disagg}/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{vllm/deepseek-v4/1k1k => dsv4/vllm/gb200-fp4/1k/1k/disagg/stp}/disagg-gb200-1p1d-dep8-dep16.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{vllm/deepseek-v4/1k1k => dsv4/vllm/gb200-fp4/1k/1k/disagg/stp}/disagg-gb200-1p1d-dep8-tep8.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{vllm/deepseek-v4/1k1k => dsv4/vllm/gb200-fp4/1k/1k/disagg/stp}/disagg-gb200-3p1d-dep8-dep16.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{vllm/deepseek-v4/8k1k => dsv4/vllm/gb200-fp4/8k/1k/disagg/stp}/disagg-gb200-1p1d-dep8-tep8.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{vllm/deepseek-v4/8k1k => dsv4/vllm/gb200-fp4/8k/1k/disagg/stp}/disagg-gb200-3p1d-dep8-dep16.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{vllm/deepseek-v4/8k1k => dsv4/vllm/gb200-fp4/8k/1k/disagg/stp}/disagg-gb200-7p1d-dep8-dep16.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP => kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp}/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP => kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp}/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP => 
kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp}/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP => kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp}/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP => kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp}/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP => kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp}/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP => kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp}/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP => kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp}/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP => kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp}/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP => kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp}/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP => kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp}/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP => kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp}/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP => kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp}/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP => kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp}/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{vllm/kimi-k2.5/1k1k => kimik2.5/vllm/gb200-fp4/1k/1k/disagg/stp}/disagg-gb200-1p1d-dep4-dep16.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{vllm/kimi-k2.5/1k1k => kimik2.5/vllm/gb200-fp4/1k/1k/disagg/stp}/disagg-gb200-1p4d-dep4-tep4.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{vllm/kimi-k2.5/8k1k => kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp}/disagg-gb200-1p4d-dep4-tep4.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{vllm/kimi-k2.5/8k1k => kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp}/disagg-gb200-3p1d-dep4-dep16.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{vllm/kimi-k2.5/8k1k => kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp}/disagg-gb200-5p1d-dep4-dep8.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{vllm/kimi-k2.5/8k1k => kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp}/disagg-gb200-6p1d-dep4-dep16.yaml (100%) diff --git a/.github/configs/CONFIGS.md b/.github/configs/CONFIGS.md index f383f20ba..482c9acfc 100644 --- a/.github/configs/CONFIGS.md +++ b/.github/configs/CONFIGS.md @@ -69,9 +69,9 @@ search-space: ``` - `recipe` is a path **relative to 
-- The path may carry an `:override[N]` / `:override_<name>` suffix to select a named override section inside an sglang-style recipe yaml (e.g. `"b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[0]"`). The launcher strips this suffix before reading the file but passes the full string to `srtctl`.
+- The path may carry an `:override[N]` / `:override_<name>` suffix to select a named override section inside an sglang-style recipe yaml (e.g. `"dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml:zip_override_mtp_lowlat[0]"`). The launcher strips this suffix before reading the file but passes the full string to `srtctl`.
 - `recipe` is optional: multi-node entries that do *not* go through srt-slurm (e.g. dynamo-sglang aggregated topologies that drive their own bash) leave it unset.
-- Recipes live under `benchmarks/multi_node/srt-slurm-recipes/` mirroring the upstream NVIDIA/srt-slurm `recipes/` layout (e.g. `trtllm/b200-fp4/...`, `vllm/deepseek-v4/...`, `gb200-fp4/...`). The benchmark template resolves `recipe` to an absolute path and passes it to the launcher as `CONFIG_FILE`, so launchers do not see the relative form.
+- Recipes live under `benchmarks/multi_node/srt-slurm-recipes/` organized as `<model>/<framework>/<gpu>-<precision>/<isl>/<osl>/<serving-mode>/<spec-mode>/<name>.yaml` — e.g. `dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml`. A handful of sglang-style files that carry override sections spanning both stp and mtp are parked one level shallower (the trailing `<spec-mode>/` segment is omitted). The benchmark template resolves `recipe` to an absolute path and passes it to the launcher as `CONFIG_FILE`, so launchers do not see the relative form.
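For orientation, a minimal sketch of a master-yaml search-space entry once `recipe:` is first-class. The recipe paths are real ones from this patch, but the entry itself is illustrative: the field set is abridged to what the nvidia-master.yaml hunks below show, the nesting is assumed, and the second entry's `conc-list` value is hypothetical.

```yaml
# Sketch only: abridged fields, assumed indentation.
search-space:
  # Plain recipe path, relative to benchmarks/multi_node/srt-slurm-recipes/.
  # The schema validator checks the file exists on disk at sweep generation;
  # the benchmark template later resolves it to an absolute path and passes
  # it to the launcher as CONFIG_FILE.
  - spec-decoding: "mtp"
    conc-list: [1214]
    recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml"
    prefill:
      num-worker: 1
      tp: 4
  # sglang-style recipe selecting a named override section. The launcher
  # strips the override suffix before reading the file but forwards the
  # full string to srtctl. (conc-list here is hypothetical.)
  - spec-decoding: "mtp"
    conc-list: [8]
    recipe: "dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml:zip_override_mtp_lowlat[0]"
```

Entries that do not go through srt-slurm simply omit `recipe:` and keep their existing launch plumbing.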
## Runners diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 4a03b1c0f..bb59f1dd0 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -13,7 +13,7 @@ dsr1-fp4-b200-dynamo-trt: search-space: - spec-decoding: "mtp" conc-list: [1214] - recipe: "trtllm/b200-fp4/1k1k/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml" + recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml" prefill: num-worker: 1 tp: 4 @@ -26,7 +26,7 @@ dsr1-fp4-b200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [875] - recipe: "trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 4 @@ -39,7 +39,7 @@ dsr1-fp4-b200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [6] - recipe: "trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 4 @@ -52,7 +52,7 @@ dsr1-fp4-b200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [10, 15, 25, 45, 90, 180] - recipe: "trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 4 @@ -65,7 +65,7 @@ dsr1-fp4-b200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [ 4968 ] - recipe: "trtllm/b200-fp4/1k1k/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml" + recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml" prefill: num-worker: 3 tp: 4 @@ -78,7 +78,7 @@ dsr1-fp4-b200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [10860] - recipe: "trtllm/b200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml" + recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml" prefill: num-worker: 3 tp: 4 @@ -92,7 +92,7 @@ dsr1-fp4-b200-dynamo-trt: # Non-MTP configurations - conc-list: [4096] - recipe: "trtllm/b200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -104,7 +104,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [2192] - recipe: "trtllm/b200-fp4/1k1k/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -116,7 +116,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [1365] - recipe: "trtllm/b200-fp4/1k1k/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -128,7 +128,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [6] - recipe: "trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -140,7 +140,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [10, 15, 25, 45, 90, 180] - recipe: "trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -152,7 +152,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [450] - recipe: 
"trtllm/b200-fp4/1k1k/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -169,7 +169,7 @@ dsr1-fp4-b200-dynamo-trt: search-space: - spec-decoding: "mtp" conc-list: [90] - recipe: "trtllm/b200-fp4/8k1k/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 4 @@ -182,7 +182,7 @@ dsr1-fp4-b200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [66] - recipe: "trtllm/b200-fp4/8k1k/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 4 @@ -195,7 +195,7 @@ dsr1-fp4-b200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [6] - recipe: "trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 4 @@ -208,7 +208,7 @@ dsr1-fp4-b200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [10, 15, 30, 60] - recipe: "trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 4 @@ -221,7 +221,7 @@ dsr1-fp4-b200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [548] - recipe: "trtllm/b200-fp4/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml" prefill: num-worker: 3 tp: 4 @@ -234,7 +234,7 @@ dsr1-fp4-b200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1096, 1691] - recipe: "trtllm/b200-fp4/8k1k/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml" + recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml" prefill: num-worker: 5 tp: 4 @@ -247,7 +247,7 @@ dsr1-fp4-b200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [658] - recipe: "trtllm/b200-fp4/8k1k/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml" prefill: num-worker: 5 tp: 4 @@ -261,7 +261,7 @@ dsr1-fp4-b200-dynamo-trt: # Non-MTP configurations - conc-list: [6] - recipe: "trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -273,7 +273,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [10, 15, 25, 50, 100] - recipe: "trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -285,7 +285,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [370] - recipe: "trtllm/b200-fp4/8k1k/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 4 @@ -297,7 +297,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [1606] - recipe: "trtllm/b200-fp4/8k1k/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml" prefill: num-worker: 4 tp: 4 @@ -309,7 +309,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [837] - recipe: "trtllm/b200-fp4/8k1k/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml" + 
recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml" prefill: num-worker: 4 tp: 4 @@ -321,7 +321,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [2222] - recipe: "trtllm/b200-fp4/8k1k/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml" prefill: num-worker: 7 tp: 4 @@ -349,7 +349,7 @@ dsr1-fp8-b200-dynamo-trt: # MTP configurations - Low latency (TP attention) - spec-decoding: "mtp" conc-list: [8] - recipe: "trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml" prefill: num-worker: 1 tp: 8 @@ -362,7 +362,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [32] - recipe: "trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml" prefill: num-worker: 1 tp: 8 @@ -375,7 +375,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [64] - recipe: "trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml" prefill: num-worker: 1 tp: 8 @@ -388,7 +388,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [256] - recipe: "trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml" prefill: num-worker: 1 tp: 8 @@ -402,7 +402,7 @@ dsr1-fp8-b200-dynamo-trt: # MTP configurations - High throughput (DP attention) - spec-decoding: "mtp" conc-list: [896] - recipe: "trtllm/b200-fp8/1k1k/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml" prefill: num-worker: 1 tp: 8 @@ -415,7 +415,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1024] - recipe: "trtllm/b200-fp8/1k1k/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml" prefill: num-worker: 1 tp: 8 @@ -428,7 +428,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1184] - recipe: "trtllm/b200-fp8/1k1k/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml" prefill: num-worker: 1 tp: 8 @@ -441,7 +441,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1600] - recipe: "trtllm/b200-fp8/1k1k/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml" prefill: num-worker: 1 tp: 8 @@ -455,7 +455,7 @@ dsr1-fp8-b200-dynamo-trt: # Non-MTP (STP) configurations - Low latency (TP attention) - conc-list: [4] - recipe: "trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml" prefill: num-worker: 1 tp: 8 @@ -467,7 +467,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: false - conc-list: [32] - recipe: "trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml" prefill: num-worker: 1 tp: 8 @@ -479,7 +479,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: 
false - conc-list: [128] - recipe: "trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml" prefill: num-worker: 1 tp: 8 @@ -492,7 +492,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: false # Non-MTP (STP) configurations - High throughput (DP attention) - conc-list: [1920] - recipe: "trtllm/b200-fp8/1k1k/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml" prefill: num-worker: 1 tp: 8 @@ -504,7 +504,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [4096] - recipe: "trtllm/b200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml" prefill: num-worker: 1 tp: 8 @@ -516,7 +516,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [5152] - recipe: "trtllm/b200-fp8/1k1k/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml" prefill: num-worker: 2 tp: 8 @@ -534,7 +534,7 @@ dsr1-fp8-b200-dynamo-trt: # MTP configurations - Low latency (TP attention) - spec-decoding: "mtp" conc-list: [8] - recipe: "trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml" prefill: num-worker: 1 tp: 8 @@ -547,7 +547,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [8] - recipe: "trtllm/b200-fp8/8k1k/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml" prefill: num-worker: 1 tp: 8 @@ -560,7 +560,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [48] - recipe: "trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml" prefill: num-worker: 1 tp: 8 @@ -573,7 +573,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [64] - recipe: "trtllm/b200-fp8/8k1k/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml" prefill: num-worker: 1 tp: 8 @@ -587,7 +587,7 @@ dsr1-fp8-b200-dynamo-trt: # MTP configurations - High throughput (DP attention) - spec-decoding: "mtp" conc-list: [224] - recipe: "trtllm/b200-fp8/8k1k/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml" prefill: num-worker: 2 tp: 8 @@ -600,7 +600,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [288] - recipe: "trtllm/b200-fp8/8k1k/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml" prefill: num-worker: 2 tp: 8 @@ -613,7 +613,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1088] - recipe: "trtllm/b200-fp8/8k1k/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml" prefill: num-worker: 4 tp: 8 @@ -627,7 +627,7 @@ dsr1-fp8-b200-dynamo-trt: # Non-MTP (STP) configurations - Low latency (TP attention) - conc-list: [1] - recipe: "trtllm/b200-fp8/8k1k/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml" + recipe: 
"dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml" prefill: num-worker: 1 tp: 8 @@ -639,7 +639,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: false - conc-list: [32] - recipe: "trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml" prefill: num-worker: 1 tp: 8 @@ -651,7 +651,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: false - conc-list: [128] - recipe: "trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml" prefill: num-worker: 1 tp: 8 @@ -663,7 +663,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: false - conc-list: [96] - recipe: "trtllm/b200-fp8/8k1k/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml" prefill: num-worker: 1 tp: 8 @@ -676,7 +676,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: false # Non-MTP (STP) configurations - High throughput (DP attention) - conc-list: [128] - recipe: "trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml" prefill: num-worker: 1 tp: 8 @@ -688,7 +688,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [128] - recipe: "trtllm/b200-fp8/8k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml" prefill: num-worker: 1 tp: 8 @@ -700,7 +700,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [256] - recipe: "trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml" prefill: num-worker: 1 tp: 8 @@ -712,7 +712,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [640] - recipe: "trtllm/b200-fp8/8k1k/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml" prefill: num-worker: 2 tp: 8 @@ -740,7 +740,7 @@ dsr1-fp4-b300-dynamo-trt: search-space: - spec-decoding: "mtp" conc-list: [654] - recipe: "trtllm/b300-fp4/1k1k/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -753,7 +753,7 @@ dsr1-fp4-b300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [271] - recipe: "trtllm/b300-fp4/1k1k/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -766,7 +766,7 @@ dsr1-fp4-b300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [11] - recipe: "trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -779,7 +779,7 @@ dsr1-fp4-b300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [10, 20, 25, 60, 120, 200] - recipe: "trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -792,7 +792,7 @@ dsr1-fp4-b300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [2342] - recipe: "trtllm/b300-fp4/1k1k/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml" + 
recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml" prefill: num-worker: 2 tp: 2 @@ -805,7 +805,7 @@ dsr1-fp4-b300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [8609] - recipe: "trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml" + recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml" prefill: num-worker: 5 tp: 2 @@ -818,7 +818,7 @@ dsr1-fp4-b300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [12926] - recipe: "trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml" + recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml" prefill: num-worker: 5 tp: 2 @@ -832,7 +832,7 @@ dsr1-fp4-b300-dynamo-trt: # Non-MTP configurations - conc-list: [1176] - recipe: "trtllm/b300-fp4/1k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -844,7 +844,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 8 dp-attn: true - conc-list: [6] - recipe: "trtllm/b300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -856,7 +856,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 8 dp-attn: false - conc-list: [5, 10, 15, 25] - recipe: "trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -868,7 +868,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 4 dp-attn: false - conc-list: [60, 110, 195, 395] - recipe: "trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -880,7 +880,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 8 dp-attn: false - conc-list: [4405] - recipe: "trtllm/b300-fp4/1k1k/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 2 @@ -892,7 +892,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 8 dp-attn: true - conc-list: [8192] - recipe: "trtllm/b300-fp4/1k1k/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml" prefill: num-worker: 3 tp: 2 @@ -904,7 +904,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 8 dp-attn: true - conc-list: [4611] - recipe: "trtllm/b300-fp4/1k1k/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml" prefill: num-worker: 3 tp: 2 @@ -921,7 +921,7 @@ dsr1-fp4-b300-dynamo-trt: search-space: - spec-decoding: "mtp" conc-list: [2198] - recipe: "trtllm/b300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml" prefill: num-worker: 10 tp: 2 @@ -934,7 +934,7 @@ dsr1-fp4-b300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [52] - recipe: "trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -947,7 +947,7 @@ dsr1-fp4-b300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [8] - recipe: "trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" + recipe: 
"dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -960,7 +960,7 @@ dsr1-fp4-b300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [32] - recipe: "trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -973,7 +973,7 @@ dsr1-fp4-b300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [181] - recipe: "trtllm/b300-fp4/8k1k/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml" prefill: num-worker: 3 tp: 2 @@ -986,7 +986,7 @@ dsr1-fp4-b300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1197] - recipe: "trtllm/b300-fp4/8k1k/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml" prefill: num-worker: 9 tp: 2 @@ -1000,7 +1000,7 @@ dsr1-fp4-b300-dynamo-trt: # Non-MTP configurations - conc-list: [105] - recipe: "trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -1012,7 +1012,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 4 dp-attn: false - conc-list: [63] - recipe: "trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -1024,7 +1024,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 8 dp-attn: false - conc-list: [4] - recipe: "trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -1036,7 +1036,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 8 dp-attn: false - conc-list: [12] - recipe: "trtllm/b300-fp4/8k1k/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -1048,7 +1048,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 4 dp-attn: false - conc-list: [589] - recipe: "trtllm/b300-fp4/8k1k/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml" prefill: num-worker: 5 tp: 2 @@ -1060,7 +1060,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 8 dp-attn: true - conc-list: [1093] - recipe: "trtllm/b300-fp4/8k1k/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml" prefill: num-worker: 6 tp: 2 @@ -1072,7 +1072,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 8 dp-attn: true - conc-list: [2048] - recipe: "trtllm/b300-fp4/8k1k/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml" prefill: num-worker: 8 tp: 2 @@ -1100,7 +1100,7 @@ dsr1-fp8-b300-dynamo-trt: search-space: - spec-decoding: "mtp" conc-list: [10] - recipe: "trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml" prefill: num-worker: 1 tp: 4 @@ -1113,7 +1113,7 @@ dsr1-fp8-b300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [160] - recipe: "trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml" prefill: 
num-worker: 1 tp: 4 @@ -1126,7 +1126,7 @@ dsr1-fp8-b300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [3072] - recipe: "trtllm/b300-fp8/1k1k/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml" prefill: num-worker: 1 tp: 4 @@ -1139,7 +1139,7 @@ dsr1-fp8-b300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [2560] - recipe: "trtllm/b300-fp8/1k1k/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml" prefill: num-worker: 1 tp: 4 @@ -1152,7 +1152,7 @@ dsr1-fp8-b300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [720] - recipe: "trtllm/b300-fp8/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml" prefill: num-worker: 1 tp: 4 @@ -1165,7 +1165,7 @@ dsr1-fp8-b300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [11264] - recipe: "trtllm/b300-fp8/1k1k/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml" prefill: num-worker: 3 tp: 4 @@ -1181,7 +1181,7 @@ dsr1-fp8-b300-dynamo-trt: osl: 1024 search-space: - conc-list: [2112] - recipe: "trtllm/b300-fp8/1k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml" prefill: num-worker: 1 tp: 4 @@ -1193,7 +1193,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 8 dp-attn: true - conc-list: [3072] - recipe: "trtllm/b300-fp8/1k1k/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml" prefill: num-worker: 1 tp: 4 @@ -1205,7 +1205,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: true - conc-list: [1280] - recipe: "trtllm/b300-fp8/1k1k/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml" prefill: num-worker: 1 tp: 4 @@ -1217,7 +1217,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: true - conc-list: [12] - recipe: "trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml" prefill: num-worker: 1 tp: 4 @@ -1229,7 +1229,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: false - conc-list: [128] - recipe: "trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml" prefill: num-worker: 1 tp: 4 @@ -1241,7 +1241,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: false - conc-list: [384] - recipe: "trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml" prefill: num-worker: 1 tp: 4 @@ -1253,7 +1253,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: false - conc-list: [16384] - recipe: "trtllm/b300-fp8/1k1k/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml" prefill: num-worker: 2 tp: 4 @@ -1270,7 +1270,7 @@ dsr1-fp8-b300-dynamo-trt: search-space: - spec-decoding: "mtp" conc-list: [40] - recipe: "trtllm/b300-fp8/8k1k/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml" + recipe: 
"dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml" prefill: num-worker: 1 tp: 4 @@ -1283,7 +1283,7 @@ dsr1-fp8-b300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [8] - recipe: "trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml" + recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml" prefill: num-worker: 1 tp: 4 @@ -1296,7 +1296,7 @@ dsr1-fp8-b300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [20] - recipe: "trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml" + recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml" prefill: num-worker: 1 tp: 4 @@ -1309,7 +1309,7 @@ dsr1-fp8-b300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [72] - recipe: "trtllm/b300-fp8/8k1k/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml" + recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml" prefill: num-worker: 1 tp: 4 @@ -1322,7 +1322,7 @@ dsr1-fp8-b300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [144] - recipe: "trtllm/b300-fp8/8k1k/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml" + recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml" prefill: num-worker: 2 tp: 4 @@ -1335,7 +1335,7 @@ dsr1-fp8-b300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [512] - recipe: "trtllm/b300-fp8/8k1k/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml" + recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml" prefill: num-worker: 4 tp: 4 @@ -1351,7 +1351,7 @@ dsr1-fp8-b300-dynamo-trt: osl: 1024 search-space: - conc-list: [64] - recipe: "trtllm/b300-fp8/8k1k/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml" + recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml" prefill: num-worker: 1 tp: 4 @@ -1363,7 +1363,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: false - conc-list: [16] - recipe: "trtllm/b300-fp8/8k1k/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml" + recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml" prefill: num-worker: 1 tp: 4 @@ -1375,7 +1375,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: false - conc-list: [256] - recipe: "trtllm/b300-fp8/8k1k/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml" + recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml" prefill: num-worker: 2 tp: 4 @@ -1387,7 +1387,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: true - conc-list: [512] - recipe: "trtllm/b300-fp8/8k1k/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml" + recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml" prefill: num-worker: 3 tp: 4 @@ -1399,7 +1399,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: true - conc-list: [256] - recipe: "trtllm/b300-fp8/8k1k/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml" + recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml" prefill: num-worker: 3 tp: 4 @@ -1411,7 +1411,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: false - conc-list: [1075] - recipe: "trtllm/b300-fp8/8k1k/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml" + recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml" prefill: num-worker: 5 tp: 4 @@ -1423,7 +1423,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: true - conc-list: [3072] - recipe: "trtllm/b300-fp8/8k1k/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml" + recipe: 
"dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml" prefill: num-worker: 7 tp: 4 @@ -2440,7 +2440,7 @@ dsr1-fp8-h200-dynamo-trt: # MTP configurations - spec-decoding: "mtp" conc-list: [1] - recipe: "trtllm/h200/1k1k/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 8 @@ -2453,7 +2453,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [4] - recipe: "trtllm/h200/1k1k/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 8 @@ -2466,7 +2466,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [8] - recipe: "trtllm/h200/1k1k/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 8 @@ -2479,7 +2479,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [16] - recipe: "trtllm/h200/1k1k/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 8 @@ -2492,7 +2492,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [32] - recipe: "trtllm/h200/1k1k/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 8 @@ -2505,7 +2505,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [64] - recipe: "trtllm/h200/1k1k/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 8 @@ -2518,7 +2518,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [128] - recipe: "trtllm/h200/1k1k/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 8 @@ -2531,7 +2531,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [256] - recipe: "trtllm/h200/1k1k/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 8 @@ -2544,7 +2544,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [512] - recipe: "trtllm/h200/1k1k/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml" prefill: num-worker: 1 tp: 8 @@ -2557,7 +2557,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: true # Non-MTP configurations (STP) - conc-list: [1] - recipe: "trtllm/h200/1k1k/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2569,7 +2569,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [4] - recipe: "trtllm/h200/1k1k/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2581,7 +2581,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [8] - recipe: 
"trtllm/h200/1k1k/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2593,7 +2593,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [16] - recipe: "trtllm/h200/1k1k/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2605,7 +2605,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [32] - recipe: "trtllm/h200/1k1k/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2617,7 +2617,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [64] - recipe: "trtllm/h200/1k1k/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2629,7 +2629,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [128] - recipe: "trtllm/h200/1k1k/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2641,7 +2641,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [256] - recipe: "trtllm/h200/1k1k/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2653,7 +2653,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [512] - recipe: "trtllm/h200/1k1k/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 8 @@ -2670,7 +2670,7 @@ dsr1-fp8-h200-dynamo-trt: # MTP configurations - spec-decoding: "mtp" conc-list: [1] - recipe: "trtllm/h200/8k1k/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 8 @@ -2683,7 +2683,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [4] - recipe: "trtllm/h200/8k1k/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 8 @@ -2696,7 +2696,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [8] - recipe: "trtllm/h200/8k1k/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 8 @@ -2709,7 +2709,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [16] - recipe: "trtllm/h200/8k1k/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml" prefill: num-worker: 1 tp: 8 @@ -2722,7 +2722,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [32] - recipe: "trtllm/h200/8k1k/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml" prefill: num-worker: 3 tp: 8 @@ -2735,7 +2735,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [64] - recipe: 
"trtllm/h200/8k1k/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml" prefill: num-worker: 1 tp: 8 @@ -2748,7 +2748,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [128] - recipe: "trtllm/h200/8k1k/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml" prefill: num-worker: 2 tp: 8 @@ -2761,7 +2761,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [256] - recipe: "trtllm/h200/8k1k/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml" prefill: num-worker: 3 tp: 8 @@ -2774,7 +2774,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [512] - recipe: "trtllm/h200/8k1k/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml" prefill: num-worker: 3 tp: 8 @@ -2787,7 +2787,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: true # Non-MTP configurations (STP) - conc-list: [1] - recipe: "trtllm/h200/8k1k/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2799,7 +2799,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [4] - recipe: "trtllm/h200/8k1k/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2811,7 +2811,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [8] - recipe: "trtllm/h200/8k1k/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2823,7 +2823,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [16] - recipe: "trtllm/h200/8k1k/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2835,7 +2835,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [32] - recipe: "trtllm/h200/8k1k/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 8 @@ -2847,7 +2847,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [64] - recipe: "trtllm/h200/8k1k/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 8 @@ -2859,7 +2859,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [128] - recipe: "trtllm/h200/8k1k/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2871,7 +2871,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [256] - recipe: "trtllm/h200/8k1k/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml" prefill: num-worker: 5 tp: 8 @@ -2883,7 +2883,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [512] - recipe: 
"trtllm/h200/8k1k/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml" prefill: num-worker: 3 tp: 8 @@ -2911,7 +2911,7 @@ dsr1-fp8-h100-dynamo-trt: # MTP configurations - spec-decoding: "mtp" conc-list: [6] - recipe: "trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -2924,7 +2924,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [9] - recipe: "trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -2937,7 +2937,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [30] - recipe: "trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -2950,7 +2950,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [60] - recipe: "trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -2963,7 +2963,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [117] - recipe: "trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -2976,7 +2976,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [231] - recipe: "trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -2989,7 +2989,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [462] - recipe: "trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -3002,7 +3002,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [615] - recipe: "trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml" prefill: num-worker: 1 tp: 16 @@ -3015,7 +3015,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1229] - recipe: "trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml" prefill: num-worker: 1 tp: 16 @@ -3028,7 +3028,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: true # Non-MTP configurations (STP) - conc-list: [6] - recipe: "trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3040,7 +3040,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: false - conc-list: [9] - recipe: "trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3052,7 +3052,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: false - conc-list: 
[30] - recipe: "trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3064,7 +3064,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: false - conc-list: [60] - recipe: "trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3076,7 +3076,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: false - conc-list: [231] - recipe: "trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3088,7 +3088,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true - conc-list: [462] - recipe: "trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3100,7 +3100,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true - conc-list: [924] - recipe: "trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3112,7 +3112,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true - conc-list: [1845] - recipe: "trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3124,7 +3124,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true - conc-list: [4916] - recipe: "trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 16 @@ -3141,7 +3141,7 @@ dsr1-fp8-h100-dynamo-trt: # MTP configurations (6 points) - spec-decoding: "mtp" conc-list: [6] - recipe: "trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -3154,7 +3154,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [9] - recipe: "trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -3167,7 +3167,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [30] - recipe: "trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -3180,7 +3180,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [77] - recipe: "trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -3195,7 +3195,7 @@ dsr1-fp8-h100-dynamo-trt: # https://github.com/InferenceMAX/InferenceMAX/actions/runs/21769314582/job/62813105509 # - spec-decoding: "mtp" # conc-list: [78] - # recipe: "trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml" + # recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml" # prefill: # num-worker: 1 # tp: 16 @@ -3208,7 +3208,7 @@ dsr1-fp8-h100-dynamo-trt: # 
dp-attn: false - spec-decoding: "mtp" conc-list: [154] - recipe: "trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml" prefill: num-worker: 2 tp: 16 @@ -3221,7 +3221,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: true # STP configurations (5 points) - conc-list: [6] - recipe: "trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3233,7 +3233,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: false - conc-list: [9] - recipe: "trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3245,7 +3245,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: false - conc-list: [30] - recipe: "trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3257,7 +3257,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: false - conc-list: [154] - recipe: "trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3269,7 +3269,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: false - conc-list: [308] - recipe: "trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 16 @@ -3494,7 +3494,7 @@ dsr1-fp8-h100-dynamo-sglang: search-space: # # STP: Max throughput TEP (1 prefill, 2 decode) # - conc-list: [1, 2, 4, 8, 16, 32, 64, 128] - # recipe: "h100/1k1k/stp/h100-fp8-1p2d-max-tp.yaml" + # recipe: "dsr1/sglang/h100-fp8/1k/1k/disagg/stp/h100-fp8-1p2d-max-tp.yaml" # prefill: # num-worker: 1 # tp: 16 @@ -3507,7 +3507,7 @@ dsr1-fp8-h100-dynamo-sglang: # dp-attn: false # # STP: Max throughput DEP (1 prefill, 1 decode, dp-attention) # - conc-list: [1, 2, 4, 8, 16, 32, 64] - # recipe: "h100/1k1k/stp/h100-fp8-1p1d-max-dep.yaml" + # recipe: "dsr1/sglang/h100-fp8/1k/1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml" # prefill: # num-worker: 1 # tp: 16 @@ -3521,7 +3521,7 @@ dsr1-fp8-h100-dynamo-sglang: # MTP: Max throughput TEP (1 prefill, 2 decode) - spec-decoding: "mtp" conc-list: [1, 2, 4, 8, 16, 32, 64, 128] - recipe: "h100/1k1k/mtp/h100-fp8-1p2d-max-tp-mtp.yaml" + recipe: "dsr1/sglang/h100-fp8/1k/1k/disagg/mtp/h100-fp8-1p2d-max-tp-mtp.yaml" prefill: num-worker: 1 tp: 16 @@ -3535,7 +3535,7 @@ dsr1-fp8-h100-dynamo-sglang: # MTP: Max throughput DEP (1 prefill, 1 decode, dp-attention) - spec-decoding: "mtp" conc-list: [1, 2, 4, 8, 16, 32, 64] - recipe: "h100/1k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml" + recipe: "dsr1/sglang/h100-fp8/1k/1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml" prefill: num-worker: 1 tp: 16 @@ -3551,7 +3551,7 @@ dsr1-fp8-h100-dynamo-sglang: search-space: # # STP: Max throughput TEP (1 prefill, 1 decode) # - conc-list: [1, 2, 4, 8, 16, 32, 64, 128] - # recipe: "h100/8k1k/stp/h100-fp8-1p1d-max-tp.yaml" + # recipe: "dsr1/sglang/h100-fp8/8k/1k/disagg/stp/h100-fp8-1p1d-max-tp.yaml" # prefill: # num-worker: 1 # tp: 16 @@ -3564,7 +3564,7 @@ dsr1-fp8-h100-dynamo-sglang: # dp-attn: false # # STP: Max throughput DEP (1 prefill, 1 decode, dp-attention) # - conc-list: [1, 2, 4, 8, 16, 32, 64] - # recipe: 
"h100/8k1k/stp/h100-fp8-1p1d-max-dep.yaml" + # recipe: "dsr1/sglang/h100-fp8/8k/1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml" # prefill: # num-worker: 1 # tp: 16 @@ -3578,7 +3578,7 @@ dsr1-fp8-h100-dynamo-sglang: # MTP: Max throughput TEP (1 prefill, 1 decode) - spec-decoding: "mtp" conc-list: [1, 2, 4, 8, 16, 32, 64, 128] - recipe: "h100/8k1k/mtp/h100-fp8-1p1d-max-tp-mtp.yaml" + recipe: "dsr1/sglang/h100-fp8/8k/1k/disagg/mtp/h100-fp8-1p1d-max-tp-mtp.yaml" prefill: num-worker: 1 tp: 16 @@ -3592,7 +3592,7 @@ dsr1-fp8-h100-dynamo-sglang: # MTP: Max throughput DEP (1 prefill, 1 decode, dp-attention) - spec-decoding: "mtp" conc-list: [1, 2, 4, 8, 16, 32, 64] - recipe: "h100/8k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml" + recipe: "dsr1/sglang/h100-fp8/8k/1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml" prefill: num-worker: 1 tp: 16 @@ -3687,7 +3687,7 @@ dsr1-fp4-gb200-dynamo-trt: # MTP configurations (spec_decoding="mtp") - spec-decoding: "mtp" conc-list: [ 180 ] - recipe: "trtllm/gb200-fp4/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 4 @@ -3700,7 +3700,7 @@ dsr1-fp4-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [ 4, 8, 12, 24, 48 ] - recipe: "trtllm/gb200-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 4 @@ -3713,7 +3713,7 @@ dsr1-fp4-gb200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [ 4301 ] - recipe: "trtllm/gb200-fp4/1k1k/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml" prefill: num-worker: 2 tp: 4 @@ -3726,7 +3726,7 @@ dsr1-fp4-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [ 2253 ] - recipe: "trtllm/gb200-fp4/1k1k/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml" prefill: num-worker: 3 tp: 4 @@ -3739,7 +3739,7 @@ dsr1-fp4-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [ 16130 ] - recipe: "trtllm/gb200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml" prefill: num-worker: 3 tp: 4 @@ -3754,7 +3754,7 @@ dsr1-fp4-gb200-dynamo-trt: # Non-MTP configurations (default spec_decoding="none") - conc-list: [ 4301 ] - recipe: "trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -3766,7 +3766,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [ 666 ] - recipe: "trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -3778,7 +3778,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 32 dp-attn: true - conc-list: [ 6144 ] - recipe: "trtllm/gb200-fp4/1k1k/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -3790,7 +3790,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 4 dp-attn: true - conc-list: [ 12, 24, 48, 96, 192 ] - recipe: "trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml" + recipe: 
"dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -3802,7 +3802,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [ 5 ] - recipe: "trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -3814,7 +3814,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [ 4301 ] - recipe: "trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml" prefill: num-worker: 2 tp: 4 @@ -3826,7 +3826,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 16 dp-attn: true - conc-list: [ 2253 ] - recipe: "trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 4 @@ -3844,7 +3844,7 @@ dsr1-fp4-gb200-dynamo-trt: # MTP configurations (spec_decoding="mtp") - spec-decoding: "mtp" conc-list: [ 4, 8, 12, 24, 48 ] - recipe: "trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 4 @@ -3857,7 +3857,7 @@ dsr1-fp4-gb200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [ 180 ] - recipe: "trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml" prefill: num-worker: 3 tp: 4 @@ -3870,7 +3870,7 @@ dsr1-fp4-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [ 1229 ] - recipe: "trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml" prefill: num-worker: 7 tp: 4 @@ -3883,7 +3883,7 @@ dsr1-fp4-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [ 666 ] - recipe: "trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml" prefill: num-worker: 8 tp: 4 @@ -3896,7 +3896,7 @@ dsr1-fp4-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [ 4301 ] - recipe: "trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml" prefill: num-worker: 11 tp: 4 @@ -3910,7 +3910,7 @@ dsr1-fp4-gb200-dynamo-trt: # Non-MTP configurations (default spec_decoding="none") - conc-list: [ 12, 44, 76 ] - recipe: "trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -3922,7 +3922,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [ 5 ] - recipe: "trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -3934,7 +3934,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [ 333 ] - recipe: "trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 4 @@ -3946,7 +3946,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 32 dp-attn: true - conc-list: [ 1229 ] - recipe: 
"trtllm/gb200-fp4/8k1k/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml" prefill: num-worker: 7 tp: 4 @@ -3958,7 +3958,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 32 dp-attn: true - conc-list: [ 2253 ] - recipe: "trtllm/gb200-fp4/8k1k/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml" prefill: num-worker: 8 tp: 4 @@ -3970,7 +3970,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 16 dp-attn: true - conc-list: [ 4096 ] - recipe: "trtllm/gb200-fp4/8k1k/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml" prefill: num-worker: 10 tp: 4 @@ -3999,7 +3999,7 @@ dsr1-fp8-gb200-dynamo-trt: search-space: - spec-decoding: "mtp" conc-list: [4301] - recipe: "trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml" prefill: num-worker: 1 tp: 8 @@ -4012,7 +4012,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [2151] - recipe: "trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml" prefill: num-worker: 1 tp: 8 @@ -4025,7 +4025,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1229] - recipe: "trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml" prefill: num-worker: 1 tp: 8 @@ -4038,7 +4038,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [615] - recipe: "trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml" prefill: num-worker: 1 tp: 8 @@ -4051,7 +4051,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [36] - recipe: "trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml" prefill: num-worker: 1 tp: 8 @@ -4064,7 +4064,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [18] - recipe: "trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml" prefill: num-worker: 1 tp: 8 @@ -4077,7 +4077,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [9] - recipe: "trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml" prefill: num-worker: 1 tp: 8 @@ -4090,7 +4090,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: false # 1k1k STP configs - conc-list: [6144] - recipe: "trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml" prefill: num-worker: 1 tp: 8 @@ -4102,7 +4102,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [4301] - recipe: "trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml" prefill: num-worker: 1 tp: 8 @@ -4114,7 
           ep: 8
           dp-attn: true
       - conc-list: [2151]
-        recipe: "trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml"
+        recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -4126,7 +4126,7 @@ dsr1-fp8-gb200-dynamo-trt:
           ep: 16
           dp-attn: true
       - conc-list: [1127]
-        recipe: "trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml"
+        recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -4138,7 +4138,7 @@ dsr1-fp8-gb200-dynamo-trt:
           ep: 32
           dp-attn: true
       - conc-list: [256]
-        recipe: "trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml"
+        recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -4150,7 +4150,7 @@ dsr1-fp8-gb200-dynamo-trt:
           ep: 32
           dp-attn: true
       - conc-list: [27]
-        recipe: "trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml"
+        recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -4162,7 +4162,7 @@ dsr1-fp8-gb200-dynamo-trt:
           ep: 8
           dp-attn: false
       - conc-list: [3]
-        recipe: "trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml"
+        recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -4179,7 +4179,7 @@ dsr1-fp8-gb200-dynamo-trt:
     search-space:
       - spec-decoding: "mtp"
        conc-list: [666]
-        recipe: "trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml"
+        recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml"
         prefill:
           num-worker: 3
           tp: 8
@@ -4192,7 +4192,7 @@ dsr1-fp8-gb200-dynamo-trt:
           dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [666]
-        recipe: "trtllm/gb200-fp8/8k1k/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml"
+        recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml"
         prefill:
           num-worker: 5
           tp: 8
@@ -4205,7 +4205,7 @@ dsr1-fp8-gb200-dynamo-trt:
           dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [333]
-        recipe: "trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml"
+        recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml"
         prefill:
           num-worker: 3
           tp: 8
@@ -4218,7 +4218,7 @@ dsr1-fp8-gb200-dynamo-trt:
           dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [333]
-        recipe: "trtllm/gb200-fp8/8k1k/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml"
+        recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml"
         prefill:
           num-worker: 4
           tp: 8
@@ -4231,7 +4231,7 @@ dsr1-fp8-gb200-dynamo-trt:
           dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [90]
-        recipe: "trtllm/gb200-fp8/8k1k/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml"
+        recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml"
         prefill:
           num-worker: 2
           tp: 8
@@ -4244,7 +4244,7 @@ dsr1-fp8-gb200-dynamo-trt:
           dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [15]
-        recipe: "trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml"
+        recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -4257,7 +4257,7 @@ dsr1-fp8-gb200-dynamo-trt:
           dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [6]
-        recipe: "trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml"
"dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml" prefill: num-worker: 1 tp: 8 @@ -4270,7 +4270,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: false # 8k1k STP configs - conc-list: [1229] - recipe: "trtllm/gb200-fp8/8k1k/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml" + recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml" prefill: num-worker: 5 tp: 8 @@ -4282,7 +4282,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 16 dp-attn: true - conc-list: [666] - recipe: "trtllm/gb200-fp8/8k1k/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml" + recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml" prefill: num-worker: 4 tp: 8 @@ -4294,7 +4294,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 32 dp-attn: true - conc-list: [615] - recipe: "trtllm/gb200-fp8/8k1k/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml" + recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml" prefill: num-worker: 3 tp: 8 @@ -4306,7 +4306,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 16 dp-attn: true - conc-list: [333] - recipe: "trtllm/gb200-fp8/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml" + recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml" prefill: num-worker: 2 tp: 8 @@ -4318,7 +4318,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 32 dp-attn: true - conc-list: [63] - recipe: "trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml" + recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml" prefill: num-worker: 1 tp: 8 @@ -4330,7 +4330,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [18] - recipe: "trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml" + recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml" prefill: num-worker: 1 tp: 8 @@ -4342,7 +4342,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [6] - recipe: "trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml" + recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml" prefill: num-worker: 1 tp: 8 @@ -4370,7 +4370,7 @@ dsr1-fp8-gb200-dynamo-sglang: search-space: # "Low latency" (1 prefill worker at TP4 and 1 decode worker at TP4) - conc-list: [4, 8] - recipe: "gb200-fp8/1k1k/low-latency.yaml" + recipe: "dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/low-latency.yaml" prefill: num-worker: 1 tp: 4 @@ -4384,7 +4384,7 @@ dsr1-fp8-gb200-dynamo-sglang: # "Mid curve" (3 prefill workers at DEP8 and 1 decode worker at DEP48) - conc-list: [1024, 2048, 4096] - recipe: "gb200-fp8/1k1k/mid-curve.yaml" + recipe: "dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/mid-curve.yaml" prefill: num-worker: 3 tp: 8 @@ -4398,7 +4398,7 @@ dsr1-fp8-gb200-dynamo-sglang: # "Max throughput" (2 prefill workers at DEP8 and 1 decode worker at DEP32) - conc-list: [1024, 2048, 4096, 6144] - recipe: "gb200-fp8/1k1k/max-tpt.yaml" + recipe: "dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/max-tpt.yaml" prefill: num-worker: 2 tp: 8 @@ -4412,7 +4412,7 @@ dsr1-fp8-gb200-dynamo-sglang: # "Ultra throughput" (1 prefill workers at DEP8 and 1 decode worker at DEP8) - conc-list: [4096] - recipe: "gb200-fp8/1k1k/ultra-tpt.yaml" + recipe: "dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/ultra-tpt.yaml" prefill: num-worker: 1 tp: 8 @@ -4429,7 +4429,7 @@ dsr1-fp8-gb200-dynamo-sglang: search-space: # "Low latency" (1 prefill worker at TP8 and 1 decode worker at TP8) - conc-list: [4, 8, 16] - recipe: "gb200-fp8/8k1k/low-latency.yaml" + recipe: 
"dsr1/sglang/gb200-fp8/8k/1k/disagg/stp/low-latency.yaml" prefill: num-worker: 1 tp: 8 @@ -4443,7 +4443,7 @@ dsr1-fp8-gb200-dynamo-sglang: # "Mid curve" (5 prefill workers at DEP8 and 1 decode worker at DEP32) - conc-list: [512, 1024, 2048, 6144] - recipe: "gb200-fp8/8k1k/mid-curve.yaml" + recipe: "dsr1/sglang/gb200-fp8/8k/1k/disagg/stp/mid-curve.yaml" prefill: num-worker: 5 tp: 8 @@ -4457,7 +4457,7 @@ dsr1-fp8-gb200-dynamo-sglang: # "Max throughput" (6 prefill workers at DEP8 and 1 decode worker at DEP24) - conc-list: [2048, 4096, 6144] - recipe: "gb200-fp8/8k1k/max_tpt.yaml" + recipe: "dsr1/sglang/gb200-fp8/8k/1k/disagg/stp/max_tpt.yaml" prefill: num-worker: 6 tp: 8 @@ -4484,7 +4484,7 @@ dsr1-fp8-gb300-dynamo-sglang: search-space: # "Low latency" (1 prefill worker at TP4 and 4 decode workers at TP4) - conc-list: [4, 8, 16, 32] - recipe: "gb300-fp8/1k1k/stp/low-latency.yaml" + recipe: "dsr1/sglang/gb300-fp8/1k/1k/disagg/stp/low-latency.yaml" prefill: num-worker: 1 tp: 4 @@ -4498,7 +4498,7 @@ dsr1-fp8-gb300-dynamo-sglang: # "Mid curve" (2 prefill workers at DEP8 and 1 decode worker at DEP32) - conc-list: [1024, 2048, 4096, 6144] - recipe: "gb300-fp8/1k1k/stp/mid.yaml" + recipe: "dsr1/sglang/gb300-fp8/1k/1k/disagg/stp/mid.yaml" prefill: num-worker: 2 tp: 8 @@ -4512,7 +4512,7 @@ dsr1-fp8-gb300-dynamo-sglang: # "Max throughput" (1 prefill worker at DEP8 and 1 decode worker at DEP8) - conc-list: [4096, 7168, 7680] - recipe: "gb300-fp8/1k1k/stp/max.yaml" + recipe: "dsr1/sglang/gb300-fp8/1k/1k/disagg/stp/max.yaml" prefill: num-worker: 1 tp: 8 @@ -4529,7 +4529,7 @@ dsr1-fp8-gb300-dynamo-sglang: search-space: # "Low latency" (1 prefill worker at TP4 and 1 decode worker at TP4) - conc-list: [4, 8] - recipe: "gb300-fp8/8k1k/stp/low-latency.yaml" + recipe: "dsr1/sglang/gb300-fp8/8k/1k/disagg/stp/low-latency.yaml" prefill: num-worker: 1 tp: 4 @@ -4543,7 +4543,7 @@ dsr1-fp8-gb300-dynamo-sglang: # "Mid curve" (5 prefill workers at DEP8 and 1 decode worker at DEP32) - conc-list: [128, 256, 512, 1024] - recipe: "gb300-fp8/8k1k/stp/mid.yaml" + recipe: "dsr1/sglang/gb300-fp8/8k/1k/disagg/stp/mid.yaml" prefill: num-worker: 5 tp: 8 @@ -4557,7 +4557,7 @@ dsr1-fp8-gb300-dynamo-sglang: # "Max throughput" (6 prefill workers at DEP8 and 1 decode worker at DEP24) - conc-list: [2048, 4096] - recipe: "gb300-fp8/8k1k/stp/max.yaml" + recipe: "dsr1/sglang/gb300-fp8/8k/1k/disagg/stp/max.yaml" prefill: num-worker: 6 tp: 8 @@ -4586,7 +4586,7 @@ dsr1-fp4-gb200-dynamo-sglang: # Low latency (1 prefill node, 2 decode nodes) - spec-decoding: "none" conc-list: [ 4, 8, 32 ] - recipe: "gb200-fp4/1k1k/low-latency.yaml" + recipe: "dsr1/sglang/gb200-fp4/1k/1k/disagg/stp/low-latency.yaml" prefill: num-worker: 1 tp: 4 @@ -4601,7 +4601,7 @@ dsr1-fp4-gb200-dynamo-sglang: # Mid curve (4 prefill nodes, 8 decode nodes) - spec-decoding: "none" conc-list: [ 512, 2048, 4096, 8192 ] - recipe: "gb200-fp4/1k1k/mid-curve.yaml" + recipe: "dsr1/sglang/gb200-fp4/1k/1k/disagg/stp/mid-curve.yaml" prefill: num-worker: 4 tp: 4 @@ -4616,7 +4616,7 @@ dsr1-fp4-gb200-dynamo-sglang: # Max throughput (4 prefill nodes, 12 decode nodes) - spec-decoding: "none" conc-list: [ 2048, 4096 ] - recipe: "gb200-fp4/1k1k/max-tpt.yaml" + recipe: "dsr1/sglang/gb200-fp4/1k/1k/disagg/stp/max-tpt.yaml" prefill: num-worker: 4 tp: 4 @@ -4635,7 +4635,7 @@ dsr1-fp4-gb200-dynamo-sglang: # Low latency (1 prefill node, 4 decode nodes) - spec-decoding: "none" conc-list: [ 4, 8 ] - recipe: "gb200-fp4/8k1k/low-latency.yaml" + recipe: 
"dsr1/sglang/gb200-fp4/8k/1k/disagg/stp/low-latency.yaml" prefill: num-worker: 1 tp: 4 @@ -4650,7 +4650,7 @@ dsr1-fp4-gb200-dynamo-sglang: # Mid curve (6 prefill nodes, 12 decode nodes) - spec-decoding: "none" conc-list: [ 512, 2048, 4096 ] - recipe: "gb200-fp4/8k1k/mid-curve.yaml" + recipe: "dsr1/sglang/gb200-fp4/8k/1k/disagg/stp/mid-curve.yaml" prefill: num-worker: 6 tp: 4 @@ -4665,7 +4665,7 @@ dsr1-fp4-gb200-dynamo-sglang: # Max throughput (10 prefill nodes, 8 decode nodes) - spec-decoding: "none" conc-list: [ 2048 ] - recipe: "gb200-fp4/8k1k/max-tpt.yaml" + recipe: "dsr1/sglang/gb200-fp4/8k/1k/disagg/stp/max-tpt.yaml" prefill: num-worker: 10 tp: 4 @@ -4693,7 +4693,7 @@ dsr1-fp4-gb300-dynamo-trt: # MTP configurations - spec-decoding: "mtp" conc-list: [3226] - recipe: "trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml" prefill: num-worker: 1 tp: 2 @@ -4706,7 +4706,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [333] - recipe: "trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml" prefill: num-worker: 1 tp: 2 @@ -4719,7 +4719,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [5] - recipe: "trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -4732,7 +4732,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [8, 12, 24, 48] - recipe: "trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -4745,7 +4745,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [2253] - recipe: "trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml" prefill: num-worker: 3 tp: 2 @@ -4758,7 +4758,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1229] - recipe: "trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml" prefill: num-worker: 3 tp: 2 @@ -4771,7 +4771,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true # Non-MTP configurations (default spec_decoding="none") - conc-list: [5] - recipe: "trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -4783,7 +4783,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 8 dp-attn: false - conc-list: [12, 48, 96, 192] - recipe: "trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -4795,7 +4795,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 8 dp-attn: false - conc-list: [8192] - recipe: "trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 2 @@ -4807,7 +4807,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 8 dp-attn: true - conc-list: [1229] - recipe: 
"trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 2 @@ -4819,7 +4819,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 32 dp-attn: true - conc-list: [4301] - recipe: "trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml" prefill: num-worker: 3 tp: 2 @@ -4831,7 +4831,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 16 dp-attn: true - conc-list: [2253] - recipe: "trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml" prefill: num-worker: 3 tp: 2 @@ -4848,7 +4848,7 @@ dsr1-fp4-gb300-dynamo-trt: # MTP configurations (spec_decoding="mtp") - spec-decoding: "mtp" conc-list: [33] - recipe: "trtllm/gb300-fp4/8k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -4861,7 +4861,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [5] - recipe: "trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -4874,7 +4874,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [12, 24] - recipe: "trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -4887,7 +4887,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [180] - recipe: "trtllm/gb300-fp4/8k1k/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml" prefill: num-worker: 4 tp: 2 @@ -4900,7 +4900,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [308] - recipe: "trtllm/gb300-fp4/8k1k/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml" prefill: num-worker: 8 tp: 2 @@ -4913,7 +4913,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [2253] - recipe: "trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml" prefill: num-worker: 10 tp: 2 @@ -4926,7 +4926,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [666] - recipe: "trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml" prefill: num-worker: 10 tp: 2 @@ -4939,7 +4939,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1127] - recipe: "trtllm/gb300-fp4/8k1k/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml" prefill: num-worker: 13 tp: 2 @@ -4952,7 +4952,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true # Non-MTP configurations (default spec_decoding="none") - conc-list: [72] - recipe: "trtllm/gb300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -4964,7 
           ep: 8
           dp-attn: false
       - conc-list: [5]
-        recipe: "trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml"
+        recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 2
@@ -4976,7 +4976,7 @@ dsr1-fp4-gb300-dynamo-trt:
           ep: 8
           dp-attn: false
       - conc-list: [12]
-        recipe: "trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml"
+        recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 2
@@ -4988,7 +4988,7 @@ dsr1-fp4-gb300-dynamo-trt:
           ep: 8
           dp-attn: false
       - conc-list: [5, 15, 30]
-        recipe: "trtllm/gb300-fp4/8k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml"
+        recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 2
@@ -5000,7 +5000,7 @@ dsr1-fp4-gb300-dynamo-trt:
           ep: 4
           dp-attn: false
       - conc-list: [666]
-        recipe: "trtllm/gb300-fp4/8k1k/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml"
+        recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml"
         prefill:
           num-worker: 7
           tp: 2
@@ -5012,7 +5012,7 @@ dsr1-fp4-gb300-dynamo-trt:
           ep: 32
           dp-attn: true
       - conc-list: [1229]
-        recipe: "trtllm/gb300-fp4/8k1k/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml"
+        recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml"
         prefill:
           num-worker: 9
           tp: 2
@@ -5024,7 +5024,7 @@ dsr1-fp4-gb300-dynamo-trt:
           ep: 16
           dp-attn: true
       - conc-list: [3228]
-        recipe: "trtllm/gb300-fp4/8k1k/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml"
+        recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml"
         prefill:
           num-worker: 11
           tp: 2
@@ -5036,7 +5036,7 @@ dsr1-fp4-gb300-dynamo-trt:
           ep: 4
           dp-attn: true
       - conc-list: [2253]
-        recipe: "trtllm/gb300-fp4/8k1k/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml"
+        recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml"
         prefill:
           num-worker: 14
           tp: 2
@@ -5065,7 +5065,7 @@ dsr1-fp4-gb300-dynamo-sglang:
       # Low latency (1 prefill node, 2 decode nodes)
       - spec-decoding: "none"
         conc-list: [ 4, 8, 32 ]
-        recipe: "gb300-fp4/1k1k/low_latency.yaml"
+        recipe: "dsr1/sglang/gb300-fp4/1k/1k/disagg/stp/low_latency.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -5080,7 +5080,7 @@ dsr1-fp4-gb300-dynamo-sglang:
       # Mid curve (4 prefill nodes, 8 decode nodes)
       - spec-decoding: "none"
         conc-list: [ 512, 2048, 4096, 8192 ]
-        recipe: "gb300-fp4/1k1k/mid_curve.yaml"
+        recipe: "dsr1/sglang/gb300-fp4/1k/1k/disagg/stp/mid_curve.yaml"
         prefill:
           num-worker: 4
           tp: 4
@@ -5095,7 +5095,7 @@ dsr1-fp4-gb300-dynamo-sglang:
       # Max throughput (4 prefill nodes, 12 decode nodes)
       - spec-decoding: "none"
         conc-list: [ 512, 2048, 4096, 8192 ]
-        recipe: "gb300-fp4/1k1k/max_tpt.yaml"
+        recipe: "dsr1/sglang/gb300-fp4/1k/1k/disagg/stp/max_tpt.yaml"
         prefill:
           num-worker: 4
           tp: 4
@@ -5114,7 +5114,7 @@ dsr1-fp4-gb300-dynamo-sglang:
       # Low latency (1 prefill node, 4 decode nodes)
       - spec-decoding: "none"
         conc-list: [ 4, 8, 32, 64 ]
-        recipe: "gb300-fp4/8k1k/low_latency.yaml"
+        recipe: "dsr1/sglang/gb300-fp4/8k/1k/disagg/stp/low_latency.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -5129,7 +5129,7 @@ dsr1-fp4-gb300-dynamo-sglang:
       # Mid curve (6 prefill nodes, 12 decode nodes)
       - spec-decoding: "none"
         conc-list: [ 512, 2048, 4096 ]
-        recipe: "gb300-fp4/8k1k/mid_curve.yaml"
+        recipe: "dsr1/sglang/gb300-fp4/8k/1k/disagg/stp/mid_curve.yaml"
         prefill:
           num-worker: 6
           tp: 4
@@ -5144,7 +5144,7 @@ dsr1-fp4-gb300-dynamo-sglang:
       # Max throughput (10 prefill nodes, 8 decode nodes)
       - spec-decoding: "none"
         conc-list: [ 2048 ]
-        recipe: "gb300-fp4/8k1k/max_tpt.yaml"
+        recipe: "dsr1/sglang/gb300-fp4/8k/1k/disagg/stp/max_tpt.yaml"
         prefill:
           num-worker: 10
           tp: 4
@@ -5172,7 +5172,7 @@ dsr1-fp8-gb300-dynamo-trt:
       # MTP configurations (spec_decoding="mtp")
       - spec-decoding: "mtp"
         conc-list: [8]
-        recipe: "trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -5185,7 +5185,7 @@ dsr1-fp8-gb300-dynamo-trt:
           dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [24]
-        recipe: "trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -5198,7 +5198,7 @@ dsr1-fp8-gb300-dynamo-trt:
           dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [180]
-        recipe: "trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -5211,7 +5211,7 @@ dsr1-fp8-gb300-dynamo-trt:
           dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [564]
-        recipe: "trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml"
         prefill:
           num-worker: 2
           tp: 4
@@ -5224,7 +5224,7 @@ dsr1-fp8-gb300-dynamo-trt:
           dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [666]
-        recipe: "trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -5237,7 +5237,7 @@ dsr1-fp8-gb300-dynamo-trt:
           dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [2253]
-        recipe: "trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml"
         prefill:
           num-worker: 2
           tp: 4
@@ -5250,7 +5250,7 @@ dsr1-fp8-gb300-dynamo-trt:
           dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [8192]
-        recipe: "trtllm/gb300-fp8/1k1k/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml"
         prefill:
           num-worker: 3
           tp: 4
@@ -5263,7 +5263,7 @@ dsr1-fp8-gb300-dynamo-trt:
           dp-attn: true
       # STP configurations (no spec_decoding)
       - conc-list: [4]
-        recipe: "trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -5275,7 +5275,7 @@ dsr1-fp8-gb300-dynamo-trt:
           ep: 8
           dp-attn: false
       - conc-list: [24]
-        recipe: "trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -5287,7 +5287,7 @@ dsr1-fp8-gb300-dynamo-trt:
           ep: 8
           dp-attn: false
       - conc-list: [84]
-        recipe: "trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -5299,7 +5299,7 @@ dsr1-fp8-gb300-dynamo-trt:
           ep: 8
           dp-attn: false
       - conc-list: [1229]
-        recipe: "trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml"
         prefill:
           num-worker: 2
           tp: 4
@@ -5311,7 +5311,7 @@ dsr1-fp8-gb300-dynamo-trt:
           ep: 32
           dp-attn: true
       - conc-list: [2253]
-        recipe: "trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml"
         prefill:
           num-worker: 2
           tp: 4
@@ -5323,7 +5323,7 @@ dsr1-fp8-gb300-dynamo-trt:
           ep: 16
           dp-attn: true
       - conc-list: [8602]
-        recipe: "trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml"
         prefill:
           num-worker: 3
           tp: 4
@@ -5335,7 +5335,7 @@ dsr1-fp8-gb300-dynamo-trt:
           ep: 8
           dp-attn: true
       - conc-list: [12288]
-        recipe: "trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml"
         prefill:
           num-worker: 3
           tp: 4
@@ -5352,7 +5352,7 @@ dsr1-fp8-gb300-dynamo-trt:
       # MTP configurations (spec_decoding="mtp")
       - spec-decoding: "mtp"
         conc-list: [8]
-        recipe: "trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -5365,7 +5365,7 @@ dsr1-fp8-gb300-dynamo-trt:
           dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [24]
-        recipe: "trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -5378,7 +5378,7 @@ dsr1-fp8-gb300-dynamo-trt:
           dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [333]
-        recipe: "trtllm/gb300-fp8/8k1k/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml"
         prefill:
           num-worker: 6
           tp: 4
@@ -5391,7 +5391,7 @@ dsr1-fp8-gb300-dynamo-trt:
           dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [666]
-        recipe: "trtllm/gb300-fp8/8k1k/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml"
         prefill:
           num-worker: 8
           tp: 4
@@ -5404,7 +5404,7 @@ dsr1-fp8-gb300-dynamo-trt:
           dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [1229]
-        recipe: "trtllm/gb300-fp8/8k1k/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml"
         prefill:
           num-worker: 10
           tp: 4
@@ -5417,7 +5417,7 @@ dsr1-fp8-gb300-dynamo-trt:
           dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [1229]
-        recipe: "trtllm/gb300-fp8/8k1k/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml"
         prefill:
           num-worker: 7
           tp: 4
@@ -5430,7 +5430,7 @@ dsr1-fp8-gb300-dynamo-trt:
           dp-attn: true
       # STP configurations (no spec_decoding)
       - conc-list: [4]
-        recipe: "trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -5442,7 +5442,7 @@ dsr1-fp8-gb300-dynamo-trt:
           ep: 8
           dp-attn: false
       - conc-list: [24]
-        recipe: "trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -5454,7 +5454,7 @@ dsr1-fp8-gb300-dynamo-trt:
           ep: 8
           dp-attn: false
       - conc-list: [36]
"trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml" + recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml" prefill: num-worker: 1 tp: 4 @@ -5466,7 +5466,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 8 dp-attn: false - conc-list: [512] - recipe: "trtllm/gb300-fp8/8k1k/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml" + recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml" prefill: num-worker: 6 tp: 4 @@ -5478,7 +5478,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 32 dp-attn: true - conc-list: [666] - recipe: "trtllm/gb300-fp8/8k1k/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml" + recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml" prefill: num-worker: 4 tp: 4 @@ -5490,7 +5490,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 16 dp-attn: true - conc-list: [1229] - recipe: "trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml" + recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml" prefill: num-worker: 7 tp: 4 @@ -5502,7 +5502,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 16 dp-attn: true - conc-list: [2151] - recipe: "trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml" + recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml" prefill: num-worker: 7 tp: 4 @@ -5800,7 +5800,7 @@ dsr1-fp8-h200-dynamo-sglang: # STP: Low latency (1 prefill, 9 decode, TEP) - spec-decoding: "none" conc-list: [1, 4, 8, 16, 32, 64, 128, 256] - recipe: "h200/1k1k/low-latency-1p9d.yaml" + recipe: "dsr1/sglang/h200-fp8/1k/1k/disagg/stp/low-latency-1p9d.yaml" prefill: num-worker: 1 tp: 8 @@ -5814,7 +5814,7 @@ dsr1-fp8-h200-dynamo-sglang: # STP: High throughput TEP (1 prefill, 6 decode) - spec-decoding: "none" conc-list: [512, 1024, 2048] - recipe: "h200/1k1k/bs256-1p6d-tp.yaml" + recipe: "dsr1/sglang/h200-fp8/1k/1k/disagg/stp/bs256-1p6d-tp.yaml" prefill: num-worker: 1 tp: 8 @@ -5828,7 +5828,7 @@ dsr1-fp8-h200-dynamo-sglang: # STP: High throughput DEP (1 prefill, 6 decode, dp-attention) - spec-decoding: "none" conc-list: [128, 256, 512, 1024, 2048] - recipe: "h200/1k1k/bs256-1p6d-dep.yaml" + recipe: "dsr1/sglang/h200-fp8/1k/1k/disagg/stp/bs256-1p6d-dep.yaml" prefill: num-worker: 1 tp: 8 @@ -5842,7 +5842,7 @@ dsr1-fp8-h200-dynamo-sglang: # MTP: Low latency (1 prefill, 9 decode, TEP) - spec-decoding: "mtp" conc-list: [1, 4, 8, 16, 32, 64, 128, 256] - recipe: "h200/1k1k/low-latency-1p9d-mtp.yaml" + recipe: "dsr1/sglang/h200-fp8/1k/1k/disagg/mtp/low-latency-1p9d-mtp.yaml" prefill: num-worker: 1 tp: 8 @@ -5856,7 +5856,7 @@ dsr1-fp8-h200-dynamo-sglang: # MTP: High throughput TEP (1 prefill, 6 decode) - spec-decoding: "mtp" conc-list: [512, 1024, 2048] - recipe: "h200/1k1k/bs256-1p6d-tp-mtp.yaml" + recipe: "dsr1/sglang/h200-fp8/1k/1k/disagg/mtp/bs256-1p6d-tp-mtp.yaml" prefill: num-worker: 1 tp: 8 @@ -5870,7 +5870,7 @@ dsr1-fp8-h200-dynamo-sglang: # MTP: High throughput DEP (1 prefill, 6 decode, dp-attention) - spec-decoding: "mtp" conc-list: [128, 256, 512, 1024, 2048] - recipe: "h200/1k1k/bs256-1p6d-dep-mtp.yaml" + recipe: "dsr1/sglang/h200-fp8/1k/1k/disagg/mtp/bs256-1p6d-dep-mtp.yaml" prefill: num-worker: 1 tp: 8 @@ -5887,7 +5887,7 @@ dsr1-fp8-h200-dynamo-sglang: # STP: Low latency TEP (1 prefill, 7 decode) - spec-decoding: "none" conc-list: [1, 4, 8] - recipe: "h200/8k1k/bs4-1p7d.yaml" + recipe: "dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs4-1p7d.yaml" prefill: num-worker: 1 tp: 8 @@ -5901,7 +5901,7 @@ 
       # STP: TEP (1 prefill, 6 decode)
       - spec-decoding: "none"
         conc-list: [4, 8, 16]
-        recipe: "h200/8k1k/bs8-1p6d.yaml"
+        recipe: "dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs8-1p6d.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -5915,7 +5915,7 @@ dsr1-fp8-h200-dynamo-sglang:
       # STP: TEP (1 prefill, 3 decode)
       - spec-decoding: "none"
         conc-list: [8, 16, 32]
-        recipe: "h200/8k1k/bs16-1p3d.yaml"
+        recipe: "dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs16-1p3d.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -5929,7 +5929,7 @@ dsr1-fp8-h200-dynamo-sglang:
       # STP: TEP (2 prefill, 3 decode)
       - spec-decoding: "none"
         conc-list: [32, 64, 128]
-        recipe: "h200/8k1k/bs64-2p3d.yaml"
+        recipe: "dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs64-2p3d.yaml"
         prefill:
           num-worker: 2
           tp: 8
@@ -5943,7 +5943,7 @@ dsr1-fp8-h200-dynamo-sglang:
       # STP: High throughput DEP (1 prefill, 1 decode, dp-attention)
       - spec-decoding: "none"
         conc-list: [64, 128, 256]
-        recipe: "h200/8k1k/bs128-1p1d-dep.yaml"
+        recipe: "dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs128-1p1d-dep.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -5957,7 +5957,7 @@ dsr1-fp8-h200-dynamo-sglang:
       # MTP: Low latency TEP (1 prefill, 7 decode)
       - spec-decoding: "mtp"
         conc-list: [1, 4, 8]
-        recipe: "h200/8k1k/bs4-1p7d-mtp.yaml"
+        recipe: "dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs4-1p7d-mtp.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -5971,7 +5971,7 @@ dsr1-fp8-h200-dynamo-sglang:
       # MTP: TEP (1 prefill, 6 decode)
       - spec-decoding: "mtp"
         conc-list: [2, 4, 8, 16, 32]
-        recipe: "h200/8k1k/bs8-1p6d-mtp.yaml"
+        recipe: "dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs8-1p6d-mtp.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -5985,7 +5985,7 @@ dsr1-fp8-h200-dynamo-sglang:
       # MTP: TEP (1 prefill, 3 decode)
       - spec-decoding: "mtp"
         conc-list: [4, 8, 16, 32, 64]
-        recipe: "h200/8k1k/bs16-1p3d-mtp.yaml"
+        recipe: "dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs16-1p3d-mtp.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -5999,7 +5999,7 @@ dsr1-fp8-h200-dynamo-sglang:
       # MTP: TEP (2 prefill, 3 decode)
       - spec-decoding: "mtp"
         conc-list: [32, 64, 128]
-        recipe: "h200/8k1k/bs64-2p3d-mtp.yaml"
+        recipe: "dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs64-2p3d-mtp.yaml"
         prefill:
           num-worker: 2
           tp: 8
@@ -6013,7 +6013,7 @@ dsr1-fp8-h200-dynamo-sglang:
       # MTP: High throughput DEP (1 prefill, 1 decode, dp-attention)
       - spec-decoding: "mtp"
         conc-list: [32, 64, 128, 256, 512]
-        recipe: "h200/8k1k/bs128-1p1d-dep-mtp.yaml"
+        recipe: "dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs128-1p1d-dep-mtp.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -6040,7 +6040,7 @@ dsr1-fp4-b200-dynamo-sglang:
     search-space:
       # Non-MTP configurations
       - conc-list: [16, 128]
-        recipe: "b200-fp4/1k1k.yaml:zip_override_stp_lowlat[0]"
+        recipe: "dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml:zip_override_stp_lowlat[0]"
         prefill:
           num-worker: 1
           tp: 4
@@ -6052,7 +6052,7 @@ dsr1-fp4-b200-dynamo-sglang:
           ep: 8
           dp-attn: false
       - conc-list: [32, 64, 256]
-        recipe: "b200-fp4/1k1k.yaml:zip_override_stp_lowlat[1]"
+        recipe: "dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml:zip_override_stp_lowlat[1]"
         prefill:
           num-worker: 1
           tp: 4
@@ -6064,7 +6064,7 @@ dsr1-fp4-b200-dynamo-sglang:
           ep: 8
           dp-attn: false
       - conc-list: [512]
-        recipe: "b200-fp4/1k1k.yaml:zip_override_stp_maxtpt[0]"
+        recipe: "dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml:zip_override_stp_maxtpt[0]"
         prefill:
           num-worker: 1
           tp: 4
@@ -6076,7 +6076,7 @@ dsr1-fp4-b200-dynamo-sglang:
           ep: 8
           dp-attn: true
       - conc-list: [512]
-        recipe: "b200-fp4/1k1k.yaml:zip_override_stp_maxtpt[1]"
+        recipe: "dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml:zip_override_stp_maxtpt[1]"
         prefill:
           num-worker: 1
           tp: 4
@@ -6092,7 +6092,7 @@ dsr1-fp4-b200-dynamo-sglang:
     search-space:
       # Non-MTP configurations
       - conc-list: [64, 128]
-        recipe: "b200-fp4/8k1k.yaml:zip_override_stp_lowlat[0]"
+        recipe: "dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml:zip_override_stp_lowlat[0]"
         prefill:
           num-worker: 1
           tp: 4
@@ -6104,7 +6104,7 @@ dsr1-fp4-b200-dynamo-sglang:
           ep: 8
           dp-attn: false
       - conc-list: [8]
-        recipe: "b200-fp4/8k1k.yaml:zip_override_stp_lowlat[1]"
+        recipe: "dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml:zip_override_stp_lowlat[1]"
         prefill:
           num-worker: 1
           tp: 4
@@ -6116,7 +6116,7 @@ dsr1-fp4-b200-dynamo-sglang:
           ep: 8
           dp-attn: false
       - conc-list: [4, 128]
-        recipe: "b200-fp4/8k1k.yaml:zip_override_stp_lowlat[2]"
+        recipe: "dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml:zip_override_stp_lowlat[2]"
         prefill:
           num-worker: 2
           tp: 4
@@ -6128,7 +6128,7 @@ dsr1-fp4-b200-dynamo-sglang:
           ep: 8
           dp-attn: false
       - conc-list: [4, 8, 16, 64]
-        recipe: "b200-fp4/8k1k.yaml:override_stp_tp4"
+        recipe: "dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml:override_stp_tp4"
         prefill:
           num-worker: 1
           tp: 4
@@ -6140,7 +6140,7 @@ dsr1-fp4-b200-dynamo-sglang:
           ep: 1
           dp-attn: false
       - conc-list: [1024, 2048]
-        recipe: "b200-fp4/8k1k.yaml:override_stp_maxtpt_7p2d"
+        recipe: "dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml:override_stp_maxtpt_7p2d"
         prefill:
           num-worker: 7
           tp: 4
@@ -6167,7 +6167,7 @@ dsr1-fp8-b200-dynamo-sglang:
     search-space:
       # Non-MTP configurations
       - conc-list: [4]
-        recipe: "b200-fp8/1k1k.yaml:zip_override_stp_lowlat[0]"
+        recipe: "dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml:zip_override_stp_lowlat[0]"
         prefill:
           num-worker: 1
           tp: 8
@@ -6179,7 +6179,7 @@ dsr1-fp8-b200-dynamo-sglang:
           ep: 8
           dp-attn: false
       - conc-list: [16, 32, 64, 128, 256]
-        recipe: "b200-fp8/1k1k.yaml:zip_override_stp_lowlat[1]"
+        recipe: "dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml:zip_override_stp_lowlat[1]"
         prefill:
           num-worker: 1
           tp: 8
@@ -6191,7 +6191,7 @@ dsr1-fp8-b200-dynamo-sglang:
           ep: 8
           dp-attn: false
       - conc-list: [1024, 2048, 4096]
-        recipe: "b200-fp8/1k1k.yaml:zip_override_stp_maxtpt[0]"
+        recipe: "dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml:zip_override_stp_maxtpt[0]"
         prefill:
           num-worker: 1
           tp: 8
@@ -6203,7 +6203,7 @@ dsr1-fp8-b200-dynamo-sglang:
           ep: 8
           dp-attn: true
       - conc-list: [2048, 4096]
-        recipe: "b200-fp8/1k1k.yaml:zip_override_stp_maxtpt[1]"
+        recipe: "dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml:zip_override_stp_maxtpt[1]"
         prefill:
           num-worker: 2
           tp: 8
@@ -6219,7 +6219,7 @@ dsr1-fp8-b200-dynamo-sglang:
     search-space:
      # STP low-latency: resolved from 8k1k.yaml zip_override_stp_lowlat
       - conc-list: [128]
-        recipe: "b200-fp8/8k1k_stp_lowlat_0.yaml"
+        recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_lowlat_0.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -6231,7 +6231,7 @@ dsr1-fp8-b200-dynamo-sglang:
           ep: 1
           dp-attn: false
       - conc-list: [128]
-        recipe: "b200-fp8/8k1k_stp_lowlat_1.yaml"
+        recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_lowlat_1.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -6243,7 +6243,7 @@ dsr1-fp8-b200-dynamo-sglang:
           ep: 1
           dp-attn: false
       - conc-list: [8, 16, 32, 64, 128]
-        recipe: "b200-fp8/8k1k_stp_lowlat_2.yaml"
+        recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_lowlat_2.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -6256,7 +6256,7 @@ dsr1-fp8-b200-dynamo-sglang:
           dp-attn: false
       # STP max-throughput: resolved from 8k1k.yaml zip_override_stp_maxtpt
       - conc-list: [288]
-        recipe: "b200-fp8/8k1k_stp_maxtpt_0.yaml"
+        recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_0.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -6268,7 +6268,7 @@ dsr1-fp8-b200-dynamo-sglang:
           ep: 8
           dp-attn: true
       - conc-list: [160, 288]
-        recipe: "b200-fp8/8k1k_stp_maxtpt_1.yaml"
+        recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_1.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -6280,7 +6280,7 @@ dsr1-fp8-b200-dynamo-sglang:
           ep: 8
           dp-attn: true
       - conc-list: [512]
-        recipe: "b200-fp8/8k1k_stp_maxtpt_2.yaml"
+        recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_2.yaml"
         prefill:
           num-worker: 2
           tp: 8
@@ -6292,7 +6292,7 @@ dsr1-fp8-b200-dynamo-sglang:
           ep: 8
           dp-attn: true
       - conc-list: [1024]
-        recipe: "b200-fp8/8k1k_stp_maxtpt_3.yaml"
+        recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_3.yaml"
         prefill:
           num-worker: 3
           tp: 8
@@ -6320,7 +6320,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp:
       # MTP low-latency: 1P1D
       - spec-decoding: "mtp"
         conc-list: [4, 64]
-        recipe: "b200-fp8/1k1k.yaml:zip_override_mtp_lowlat[0]"
+        recipe: "dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml:zip_override_mtp_lowlat[0]"
         prefill:
           num-worker: 1
           tp: 8
@@ -6334,7 +6334,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp:
       # MTP low-latency: 1P3D
       - spec-decoding: "mtp"
         conc-list: [4, 8, 16, 32, 128]
-        recipe: "b200-fp8/1k1k.yaml:zip_override_mtp_lowlat[1]"
+        recipe: "dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml:zip_override_mtp_lowlat[1]"
         prefill:
           num-worker: 1
           tp: 8
@@ -6348,7 +6348,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp:
       # MTP max-tpt: 1P5D
       - spec-decoding: "mtp"
         conc-list: [512, 4096]
-        recipe: "b200-fp8/1k1k.yaml:zip_override_mtp_maxtpt[1]"
+        recipe: "dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml:zip_override_mtp_maxtpt[1]"
         prefill:
           num-worker: 1
           tp: 8
@@ -6362,7 +6362,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp:
       # MTP max-tpt: 2P5D
       - spec-decoding: "mtp"
         conc-list: [1024, 2048, 4096]
-        recipe: "b200-fp8/1k1k.yaml:zip_override_mtp_maxtpt[2]"
+        recipe: "dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml:zip_override_mtp_maxtpt[2]"
         prefill:
           num-worker: 2
           tp: 8
@@ -6376,7 +6376,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp:
       # MTP max-tpt: 1P2D
       - spec-decoding: "mtp"
         conc-list: [512, 1024, 2048]
-        recipe: "b200-fp8/1k1k.yaml:override_mtp_maxtpt_1p2d"
+        recipe: "dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml:override_mtp_maxtpt_1p2d"
         prefill:
           num-worker: 1
           tp: 8
@@ -6393,7 +6393,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp:
       # MTP low-latency: resolved from 8k1k.yaml zip_override_mtp_lowlat
       - spec-decoding: "mtp"
         conc-list: [128]
-        recipe: "b200-fp8/8k1k_mtp_lowlat_0.yaml"
+        recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -6406,7 +6406,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp:
           dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [128]
-        recipe: "b200-fp8/8k1k_mtp_lowlat_1.yaml"
+        recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -6419,7 +6419,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp:
           dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [8, 16, 32, 64, 128]
-        recipe: "b200-fp8/8k1k_mtp_lowlat_2.yaml"
+        recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml"
        prefill:
           num-worker: 1
           tp: 8
@@ -6433,7 +6433,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp:
       # MTP max-throughput: resolved from 8k1k.yaml zip_override_mtp_maxtpt
       - spec-decoding: "mtp"
         conc-list: [288]
-        recipe: "b200-fp8/8k1k_mtp_maxtpt_0.yaml"
+        recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_0.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -6446,7 +6446,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp:
           dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [160, 288]
-        recipe: "b200-fp8/8k1k_mtp_maxtpt_1.yaml"
+        recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_1.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -6459,7 +6459,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp:
           dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [512]
-        recipe: "b200-fp8/8k1k_mtp_maxtpt_2.yaml"
+        recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_2.yaml"
         prefill:
           num-worker: 2
           tp: 8
@@ -6472,7 +6472,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp:
           dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [1024]
-        recipe: "b200-fp8/8k1k_mtp_maxtpt_3.yaml"
+        recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_3.yaml"
         prefill:
           num-worker: 3
           tp: 8
@@ -6499,7 +6499,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp:
     search-space:
       - spec-decoding: "mtp"
         conc-list: [16, 512]
-        recipe: "b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[0]"
+        recipe: "dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml:zip_override_mtp_lowlat[0]"
         prefill:
           num-worker: 1
           tp: 4
@@ -6512,7 +6512,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp:
           dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [32, 64, 256, 512]
-        recipe: "b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[1]"
+        recipe: "dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml:zip_override_mtp_lowlat[1]"
         prefill:
           num-worker: 1
           tp: 4
@@ -6525,7 +6525,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp:
           dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [512, 1024]
-        recipe: "b200-fp4/1k1k.yaml:zip_override_mtp_maxtpt[0]"
+        recipe: "dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml:zip_override_mtp_maxtpt[0]"
         prefill:
           num-worker: 1
           tp: 4
@@ -6538,7 +6538,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp:
           dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [512]
-        recipe: "b200-fp4/1k1k.yaml:zip_override_mtp_maxtpt[1]"
+        recipe: "dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml:zip_override_mtp_maxtpt[1]"
         prefill:
           num-worker: 1
           tp: 4
@@ -6557,7 +6557,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp:
     search-space:
       - spec-decoding: "mtp"
         conc-list: [64, 128]
-        recipe: "b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[0]"
+        recipe: "dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml:zip_override_mtp_lowlat[0]"
         prefill:
           num-worker: 1
           tp: 4
@@ -6570,7 +6570,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp:
           dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [8]
-        recipe: "b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[1]"
+        recipe: "dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml:zip_override_mtp_lowlat[1]"
         prefill:
           num-worker: 1
           tp: 4
@@ -6583,7 +6583,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp:
           dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [4, 128]
-        recipe: "b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[2]"
+        recipe: "dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml:zip_override_mtp_lowlat[2]"
         prefill:
           num-worker: 2
           tp: 4
@@ -6596,7 +6596,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp:
           dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [4, 8, 16, 64]
-        recipe: "b200-fp4/8k1k.yaml:override_mtp_tp4"
+        recipe: "dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml:override_mtp_tp4"
         prefill:
           num-worker: 1
           tp: 4
@@ -6623,7 +6623,7 @@ kimik2.5-fp4-gb200-dynamo-trt:
     search-space:
       # Non-MTP configurations (default spec_decoding="none")
       - conc-list: [ 4, 192, 360, 668 ]
-        recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml"
+        recipe: "kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -6635,7 +6635,7 @@ kimik2.5-fp4-gb200-dynamo-trt:
           ep: 8
           dp-attn: false
       - conc-list: [ 5, 15, 30, 55 ]
-        recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml"
+        recipe: "kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -6647,7 +6647,7 @@ kimik2.5-fp4-gb200-dynamo-trt:
           ep: 4
           dp-attn: false
       - conc-list: [ 666 ]
-        recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml"
+        recipe: "kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -6659,7 +6659,7 @@ kimik2.5-fp4-gb200-dynamo-trt:
           ep: 16
           dp-attn: true
       - conc-list: [ 2253 ]
-        recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml"
+        recipe: "kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -6671,7 +6671,7 @@ kimik2.5-fp4-gb200-dynamo-trt:
           ep: 32
           dp-attn: true
       - conc-list: [ 4301, 6452 ]
-        recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml"
+        recipe: "kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -6683,7 +6683,7 @@ kimik2.5-fp4-gb200-dynamo-trt:
           ep: 8
           dp-attn: true
       - conc-list: [ 4301 ]
-        recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml"
+        recipe: "kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml"
         prefill:
           num-worker: 2
           tp: 4
@@ -6695,7 +6695,7 @@ kimik2.5-fp4-gb200-dynamo-trt:
           ep: 16
           dp-attn: true
       - conc-list: [ 4301 ]
-        recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml"
+        recipe: "kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml"
         prefill:
           num-worker: 2
           tp: 4
@@ -6712,7 +6712,7 @@ kimik2.5-fp4-gb200-dynamo-trt:
     search-space:
       # Non-MTP configurations (default spec_decoding="none")
       - conc-list: [ 4 ]
-        recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml"
+        recipe: "kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -6724,7 +6724,7 @@ kimik2.5-fp4-gb200-dynamo-trt:
           ep: 8
           dp-attn: false
       - conc-list: [ 156 ]
-        recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml"
+        recipe: "kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -6736,7 +6736,7 @@ kimik2.5-fp4-gb200-dynamo-trt:
           ep: 4
           dp-attn: false
       - conc-list: [ 5, 15, 30, 60, 105 ]
-        recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml"
+        recipe: "kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -6748,7 +6748,7 @@ kimik2.5-fp4-gb200-dynamo-trt:
           ep: 4
           dp-attn: false
       - conc-list: [ 333 ]
-        recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml"
+        recipe: "kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml"
         prefill:
           num-worker: 2
           tp: 4
@@ -6760,7 +6760,7 @@ kimik2.5-fp4-gb200-dynamo-trt:
           ep: 16
           dp-attn: true
       - conc-list: [ 615 ]
-        recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml"
+        recipe: "kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml"
         prefill:
           num-worker: 3
           tp: 4
@@ -6772,7 +6772,7 @@ kimik2.5-fp4-gb200-dynamo-trt:
           ep: 16
           dp-attn: true
       - conc-list: [ 2151 ]
- recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml" + recipe: "kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml" prefill: num-worker: 5 tp: 4 @@ -6784,7 +6784,7 @@ kimik2.5-fp4-gb200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [ 2253 ] - recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml" + recipe: "kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml" prefill: num-worker: 7 tp: 4 @@ -6810,7 +6810,7 @@ kimik2.5-fp4-gb200-dynamo-vllm: osl: 1024 search-space: - conc-list: [256, 512, 1024, 2048, 3072, 4096] - recipe: "vllm/kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml" + recipe: "kimik2.5/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p1d-dep4-dep16.yaml" prefill: num-worker: 1 tp: 4 @@ -6822,7 +6822,7 @@ kimik2.5-fp4-gb200-dynamo-vllm: ep: 16 dp-attn: true - conc-list: [4, 8, 16, 32, 64, 128] - recipe: "vllm/kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml" + recipe: "kimik2.5/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml" prefill: num-worker: 1 tp: 4 @@ -6837,7 +6837,7 @@ kimik2.5-fp4-gb200-dynamo-vllm: osl: 1024 search-space: - conc-list: [4, 8, 16, 32, 128] - recipe: "vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml" + recipe: "kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml" prefill: num-worker: 1 tp: 4 @@ -6849,7 +6849,7 @@ kimik2.5-fp4-gb200-dynamo-vllm: ep: 4 dp-attn: false - conc-list: [512, 1024] - recipe: "vllm/kimi-k2.5/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml" + recipe: "kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-3p1d-dep4-dep16.yaml" prefill: num-worker: 3 tp: 4 @@ -6861,7 +6861,7 @@ kimik2.5-fp4-gb200-dynamo-vllm: ep: 16 dp-attn: true - conc-list: [2048] - recipe: "vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml" + recipe: "kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-5p1d-dep4-dep8.yaml" prefill: num-worker: 5 tp: 4 @@ -6873,7 +6873,7 @@ kimik2.5-fp4-gb200-dynamo-vllm: ep: 8 dp-attn: true - conc-list: [3072, 4096] - recipe: "vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml" + recipe: "kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-6p1d-dep4-dep16.yaml" prefill: num-worker: 6 tp: 4 @@ -6905,7 +6905,7 @@ dsv4-fp4-gb200-dynamo-vllm: # 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch # 1p1d-dep8-tep8.yaml (offload + numa-bind stripped — see recipe header). - conc-list: [1, 4, 8, 16, 32, 64] - recipe: "vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml" + recipe: "dsv4/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml" prefill: num-worker: 1 tp: 8 @@ -6919,7 +6919,7 @@ dsv4-fp4-gb200-dynamo-vllm: # Mid throughput: 1 prefill (DP=8) + 1 wide decode (DP=16). # 6 nodes. Single prefill is plenty for 1k prompts up to ~conc 4096. - conc-list: [128, 256, 1024, 2048, 4096] - recipe: "vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml" + recipe: "dsv4/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p1d-dep8-dep16.yaml" prefill: num-worker: 1 tp: 8 @@ -6934,7 +6934,7 @@ dsv4-fp4-gb200-dynamo-vllm: # The 4096 overlap with the 1p1d block gives a crossover point. 8192 # would saturate 1p1d's prefill, so this topology takes over there. 
- conc-list: [4096, 8192] - recipe: "vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml" + recipe: "dsv4/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml" prefill: num-worker: 3 tp: 8 @@ -6952,7 +6952,7 @@ dsv4-fp4-gb200-dynamo-vllm: # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). # 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch. - conc-list: [1, 4, 8, 16, 32, 64] - recipe: "vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml" + recipe: "dsv4/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml" prefill: num-worker: 1 tp: 8 @@ -6965,7 +6965,7 @@ dsv4-fp4-gb200-dynamo-vllm: dp-attn: false # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total. - conc-list: [512, 1024] - recipe: "vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" + recipe: "dsv4/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml" prefill: num-worker: 3 tp: 8 @@ -6979,7 +6979,7 @@ dsv4-fp4-gb200-dynamo-vllm: # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes # (full cluster). Mirrors NVIDIA/srt-slurm PR #67. - conc-list: [4096, 8192] - recipe: "vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" + recipe: "dsv4/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-7p1d-dep8-dep16.yaml" prefill: num-worker: 7 tp: 8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp4/1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/b200-fp4/1k1k.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp4/8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/b200-fp4/8k1k.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/b200-fp8/1k1k.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_lowlat_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_lowlat_0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_lowlat_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_lowlat_1.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_lowlat_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_lowlat_2.yaml rename to 
diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_2.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_2.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_2.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_lowlat_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_lowlat_0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_lowlat_0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_lowlat_0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_lowlat_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_lowlat_1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_lowlat_1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_lowlat_1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_lowlat_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_lowlat_2.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_lowlat_2.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_lowlat_2.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_2.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_2.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_2.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/1k1k/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k/1k/disagg/stp/low-latency.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/1k1k/low-latency.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k/1k/disagg/stp/low-latency.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/1k1k/max-tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k/1k/disagg/stp/max-tpt.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/1k1k/max-tpt.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k/1k/disagg/stp/max-tpt.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/1k1k/mid-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k/1k/disagg/stp/mid-curve.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/1k1k/mid-curve.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k/1k/disagg/stp/mid-curve.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/8k1k/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k/1k/disagg/stp/low-latency.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/8k1k/low-latency.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k/1k/disagg/stp/low-latency.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/8k1k/max-tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k/1k/disagg/stp/max-tpt.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/8k1k/max-tpt.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k/1k/disagg/stp/max-tpt.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/8k1k/mid-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k/1k/disagg/stp/mid-curve.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/8k1k/mid-curve.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k/1k/disagg/stp/mid-curve.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/low-latency.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/low-latency.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/low-latency.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/max-tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/max-tpt.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/max-tpt.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/max-tpt.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/mid-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/mid-curve.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/mid-curve.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/mid-curve.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/ultra-tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/ultra-tpt.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/ultra-tpt.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/ultra-tpt.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/8k1k/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k/1k/disagg/stp/low-latency.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/8k1k/low-latency.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k/1k/disagg/stp/low-latency.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/8k1k/max_tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k/1k/disagg/stp/max_tpt.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/8k1k/max_tpt.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k/1k/disagg/stp/max_tpt.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/8k1k/mid-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k/1k/disagg/stp/mid-curve.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/8k1k/mid-curve.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k/1k/disagg/stp/mid-curve.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/1k1k/low_latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k/1k/disagg/stp/low_latency.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/1k1k/low_latency.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k/1k/disagg/stp/low_latency.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/1k1k/max_tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k/1k/disagg/stp/max_tpt.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/1k1k/max_tpt.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k/1k/disagg/stp/max_tpt.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/1k1k/mid_curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k/1k/disagg/stp/mid_curve.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/1k1k/mid_curve.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k/1k/disagg/stp/mid_curve.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/8k1k/low_latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k/1k/disagg/stp/low_latency.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/8k1k/low_latency.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k/1k/disagg/stp/low_latency.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/8k1k/max_tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k/1k/disagg/stp/max_tpt.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/8k1k/max_tpt.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k/1k/disagg/stp/max_tpt.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/8k1k/mid_curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k/1k/disagg/stp/mid_curve.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/8k1k/mid_curve.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k/1k/disagg/stp/mid_curve.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/1k1k/stp/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k/1k/disagg/stp/low-latency.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/1k1k/stp/low-latency.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k/1k/disagg/stp/low-latency.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/1k1k/stp/max.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k/1k/disagg/stp/max.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/1k1k/stp/max.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k/1k/disagg/stp/max.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/1k1k/stp/mid.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k/1k/disagg/stp/mid.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/1k1k/stp/mid.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k/1k/disagg/stp/mid.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/8k1k/stp/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k/1k/disagg/stp/low-latency.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/8k1k/stp/low-latency.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k/1k/disagg/stp/low-latency.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/8k1k/stp/max.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k/1k/disagg/stp/max.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/8k1k/stp/max.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k/1k/disagg/stp/max.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/8k1k/stp/mid.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k/1k/disagg/stp/mid.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/8k1k/stp/mid.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k/1k/disagg/stp/mid.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k/1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k/1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/mtp/h100-fp8-1p2d-max-tp-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k/1k/disagg/mtp/h100-fp8-1p2d-max-tp-mtp.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/mtp/h100-fp8-1p2d-max-tp-mtp.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k/1k/disagg/mtp/h100-fp8-1p2d-max-tp-mtp.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/stp/h100-fp8-1p1d-max-dep.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k/1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/stp/h100-fp8-1p1d-max-dep.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k/1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/stp/h100-fp8-1p2d-max-tp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k/1k/disagg/stp/h100-fp8-1p2d-max-tp.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/stp/h100-fp8-1p2d-max-tp.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k/1k/disagg/stp/h100-fp8-1p2d-max-tp.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k/1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k/1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/mtp/h100-fp8-1p1d-max-tp-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k/1k/disagg/mtp/h100-fp8-1p1d-max-tp-mtp.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/mtp/h100-fp8-1p1d-max-tp-mtp.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k/1k/disagg/mtp/h100-fp8-1p1d-max-tp-mtp.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/stp/h100-fp8-1p1d-max-dep.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k/1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/stp/h100-fp8-1p1d-max-dep.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k/1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/stp/h100-fp8-1p1d-max-tp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k/1k/disagg/stp/h100-fp8-1p1d-max-tp.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/stp/h100-fp8-1p1d-max-tp.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k/1k/disagg/stp/h100-fp8-1p1d-max-tp.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-dep-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/mtp/bs256-1p6d-dep-mtp.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-dep-mtp.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/mtp/bs256-1p6d-dep-mtp.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-tp-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/mtp/bs256-1p6d-tp-mtp.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-tp-mtp.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/mtp/bs256-1p6d-tp-mtp.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/low-latency-1p9d-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/mtp/low-latency-1p9d-mtp.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/low-latency-1p9d-mtp.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/mtp/low-latency-1p9d-mtp.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-dep.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/stp/bs256-1p6d-dep.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-dep.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/stp/bs256-1p6d-dep.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-tp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/stp/bs256-1p6d-tp.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-tp.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/stp/bs256-1p6d-tp.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/low-latency-1p9d.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/stp/low-latency-1p9d.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/low-latency-1p9d.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/stp/low-latency-1p9d.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs128-1p1d-dep-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs128-1p1d-dep-mtp.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs128-1p1d-dep-mtp.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs128-1p1d-dep-mtp.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs16-1p3d-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs16-1p3d-mtp.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs16-1p3d-mtp.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs16-1p3d-mtp.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs4-1p7d-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs4-1p7d-mtp.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs4-1p7d-mtp.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs4-1p7d-mtp.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs64-2p3d-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs64-2p3d-mtp.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs64-2p3d-mtp.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs64-2p3d-mtp.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs8-1p6d-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs8-1p6d-mtp.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs8-1p6d-mtp.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs8-1p6d-mtp.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs128-1p1d-dep.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs128-1p1d-dep.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs128-1p1d-dep.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs128-1p1d-dep.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs16-1p3d.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs16-1p3d.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs16-1p3d.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs16-1p3d.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs4-1p7d.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs4-1p7d.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs4-1p7d.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs4-1p7d.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs64-2p3d.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs64-2p3d.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs64-2p3d.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs64-2p3d.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs8-1p6d.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs8-1p6d.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs8-1p6d.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs8-1p6d.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml
similarity index 100%
rename from
benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml 
similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml rename to 
benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml similarity index 100% rename from 
benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml similarity index 100% 
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml rename to 
benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml similarity index 100% rename from 
benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p1d-dep8-dep16.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p1d-dep8-dep16.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-7p1d-dep8-dep16.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-7p1d-dep8-dep16.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p1d-dep4-dep16.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p1d-dep4-dep16.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml
b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-3p1d-dep4-dep16.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-3p1d-dep4-dep16.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-5p1d-dep4-dep8.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-5p1d-dep4-dep8.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-6p1d-dep4-dep16.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-6p1d-dep4-dep16.yaml From aa430b5baf5314f6dcc4829e190fceb299c074b3 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 28 Apr 2026 13:57:14 -0500 Subject: [PATCH 05/16] srt-slurm: collapse split // recipe dirs into / Per request, drop the awkward `1k/1k/` two-segment intermediate in the recipe tree in favor of `1k1k/`. New shape: //-////.yaml 370 files renamed, 393 recipe references in nvidia-master.yaml rewritten, schema validation + tests still green. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/CONFIGS.md | 4 +- .github/configs/nvidia-master.yaml | 786 +++++++++--------- .../b200-fp4/{1k/1k => 1k1k}/disagg/1k1k.yaml | 0 .../b200-fp4/{8k/1k => 8k1k}/disagg/8k1k.yaml | 0 .../b200-fp8/{1k/1k => 1k1k}/disagg/1k1k.yaml | 0 .../disagg/mtp/8k1k_mtp_lowlat_0.yaml | 0 .../disagg/mtp/8k1k_mtp_lowlat_1.yaml | 0 .../disagg/mtp/8k1k_mtp_lowlat_2.yaml | 0 .../disagg/mtp/8k1k_mtp_maxtpt_0.yaml | 0 .../disagg/mtp/8k1k_mtp_maxtpt_1.yaml | 0 .../disagg/mtp/8k1k_mtp_maxtpt_2.yaml | 0 .../disagg/mtp/8k1k_mtp_maxtpt_3.yaml | 0 .../disagg/stp/8k1k_stp_lowlat_0.yaml | 0 .../disagg/stp/8k1k_stp_lowlat_1.yaml | 0 .../disagg/stp/8k1k_stp_lowlat_2.yaml | 0 .../disagg/stp/8k1k_stp_maxtpt_0.yaml | 0 .../disagg/stp/8k1k_stp_maxtpt_1.yaml | 0 .../disagg/stp/8k1k_stp_maxtpt_2.yaml | 0 .../disagg/stp/8k1k_stp_maxtpt_3.yaml | 0 .../1k => 1k1k}/disagg/stp/low-latency.yaml | 0 .../{1k/1k => 1k1k}/disagg/stp/max-tpt.yaml | 0 .../{1k/1k => 1k1k}/disagg/stp/mid-curve.yaml | 0 .../1k => 8k1k}/disagg/stp/low-latency.yaml | 0 .../{8k/1k => 8k1k}/disagg/stp/max-tpt.yaml | 0 .../{8k/1k => 8k1k}/disagg/stp/mid-curve.yaml | 0 .../1k => 1k1k}/disagg/stp/low-latency.yaml | 0 .../{1k/1k => 1k1k}/disagg/stp/max-tpt.yaml | 0 .../{1k/1k => 1k1k}/disagg/stp/mid-curve.yaml | 0 .../{1k/1k => 1k1k}/disagg/stp/ultra-tpt.yaml | 0 .../1k => 8k1k}/disagg/stp/low-latency.yaml | 0 .../{8k/1k => 8k1k}/disagg/stp/max_tpt.yaml | 0 .../{8k/1k => 8k1k}/disagg/stp/mid-curve.yaml | 0 .../1k => 1k1k}/disagg/stp/low_latency.yaml | 0 .../{1k/1k => 1k1k}/disagg/stp/max_tpt.yaml | 0 .../{1k/1k => 1k1k}/disagg/stp/mid_curve.yaml | 0 .../1k => 8k1k}/disagg/stp/low_latency.yaml | 0 .../{8k/1k => 8k1k}/disagg/stp/max_tpt.yaml | 0 .../{8k/1k => 
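
Mechanically the collapse is a single path-segment fold; a minimal
sketch of the rename rule (illustrative only; the helper below is
hypothetical, not the migration script actually used):

    import re

    def collapse_isl_osl(path: str) -> str:
        # Fold the split "<isl>/<osl>/" pair (e.g. "1k/1k/") into the
        # single "<isl><osl>/" segment (e.g. "1k1k/").
        return re.sub(r"/(\d+k)/(\d+k)/", r"/\1\2/", path)

    # collapse_isl_osl("dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/max-tpt.yaml")
    #   -> "dsr1/sglang/gb200-fp8/1k1k/disagg/stp/max-tpt.yaml"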

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .github/configs/CONFIGS.md | 4 +-
 .github/configs/nvidia-master.yaml | 786 +++++++++---------
 .../b200-fp4/{1k/1k => 1k1k}/disagg/1k1k.yaml | 0
 .../b200-fp4/{8k/1k => 8k1k}/disagg/8k1k.yaml | 0
 .../b200-fp8/{1k/1k => 1k1k}/disagg/1k1k.yaml | 0
 .../disagg/mtp/8k1k_mtp_lowlat_0.yaml | 0
 .../disagg/mtp/8k1k_mtp_lowlat_1.yaml | 0
 .../disagg/mtp/8k1k_mtp_lowlat_2.yaml | 0
 .../disagg/mtp/8k1k_mtp_maxtpt_0.yaml | 0
 .../disagg/mtp/8k1k_mtp_maxtpt_1.yaml | 0
 .../disagg/mtp/8k1k_mtp_maxtpt_2.yaml | 0
 .../disagg/mtp/8k1k_mtp_maxtpt_3.yaml | 0
 .../disagg/stp/8k1k_stp_lowlat_0.yaml | 0
 .../disagg/stp/8k1k_stp_lowlat_1.yaml | 0
 .../disagg/stp/8k1k_stp_lowlat_2.yaml | 0
 .../disagg/stp/8k1k_stp_maxtpt_0.yaml | 0
 .../disagg/stp/8k1k_stp_maxtpt_1.yaml | 0
 .../disagg/stp/8k1k_stp_maxtpt_2.yaml | 0
 .../disagg/stp/8k1k_stp_maxtpt_3.yaml | 0
 .../1k => 1k1k}/disagg/stp/low-latency.yaml | 0
 .../{1k/1k => 1k1k}/disagg/stp/max-tpt.yaml | 0
 .../{1k/1k => 1k1k}/disagg/stp/mid-curve.yaml | 0
 .../1k => 8k1k}/disagg/stp/low-latency.yaml | 0
 .../{8k/1k => 8k1k}/disagg/stp/max-tpt.yaml | 0
 .../{8k/1k => 8k1k}/disagg/stp/mid-curve.yaml | 0
 .../1k => 1k1k}/disagg/stp/low-latency.yaml | 0
 .../{1k/1k => 1k1k}/disagg/stp/max-tpt.yaml | 0
 .../{1k/1k => 1k1k}/disagg/stp/mid-curve.yaml | 0
 .../{1k/1k => 1k1k}/disagg/stp/ultra-tpt.yaml | 0
 .../1k => 8k1k}/disagg/stp/low-latency.yaml | 0
 .../{8k/1k => 8k1k}/disagg/stp/max_tpt.yaml | 0
 .../{8k/1k => 8k1k}/disagg/stp/mid-curve.yaml | 0
 .../1k => 1k1k}/disagg/stp/low_latency.yaml | 0
 .../{1k/1k => 1k1k}/disagg/stp/max_tpt.yaml | 0
 .../{1k/1k => 1k1k}/disagg/stp/mid_curve.yaml | 0
 .../1k => 8k1k}/disagg/stp/low_latency.yaml | 0
 .../{8k/1k => 8k1k}/disagg/stp/max_tpt.yaml | 0
 .../{8k/1k => 8k1k}/disagg/stp/mid_curve.yaml | 0
 .../1k => 1k1k}/disagg/stp/low-latency.yaml | 0
 .../{1k/1k => 1k1k}/disagg/stp/max.yaml | 0
 .../{1k/1k => 1k1k}/disagg/stp/mid.yaml | 0
 .../1k => 8k1k}/disagg/stp/low-latency.yaml | 0
 .../{8k/1k => 8k1k}/disagg/stp/max.yaml | 0
 .../{8k/1k => 8k1k}/disagg/stp/mid.yaml | 0
 .../disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml | 0
 .../disagg/mtp/h100-fp8-1p2d-max-tp-mtp.yaml | 0
 .../disagg/stp/h100-fp8-1p1d-max-dep.yaml | 0
 .../disagg/stp/h100-fp8-1p2d-max-tp.yaml | 0
 .../disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml | 0
 .../disagg/mtp/h100-fp8-1p1d-max-tp-mtp.yaml | 0
 .../disagg/stp/h100-fp8-1p1d-max-dep.yaml | 0
 .../disagg/stp/h100-fp8-1p1d-max-tp.yaml | 0
 .../disagg/mtp/bs256-1p6d-dep-mtp.yaml | 0
 .../disagg/mtp/bs256-1p6d-tp-mtp.yaml | 0
 .../disagg/mtp/low-latency-1p9d-mtp.yaml | 0
 .../disagg/stp/bs256-1p6d-dep.yaml | 0
 .../1k => 1k1k}/disagg/stp/bs256-1p6d-tp.yaml | 0
 .../disagg/stp/low-latency-1p9d.yaml | 0
 .../disagg/mtp/bs128-1p1d-dep-mtp.yaml | 0
 .../1k => 8k1k}/disagg/mtp/bs16-1p3d-mtp.yaml | 0
 .../1k => 8k1k}/disagg/mtp/bs4-1p7d-mtp.yaml | 0
 .../1k => 8k1k}/disagg/mtp/bs64-2p3d-mtp.yaml | 0
 .../1k => 8k1k}/disagg/mtp/bs8-1p6d-mtp.yaml | 0
 .../disagg/stp/bs128-1p1d-dep.yaml | 0
 .../{8k/1k => 8k1k}/disagg/stp/bs16-1p3d.yaml | 0
 .../{8k/1k => 8k1k}/disagg/stp/bs4-1p7d.yaml | 0
 .../{8k/1k => 8k1k}/disagg/stp/bs64-2p3d.yaml | 0
 .../{8k/1k => 8k1k}/disagg/stp/bs8-1p6d.yaml | 0
 .../ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml | 0
 .../ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml | 0
 .../mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml | 0
 .../ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml | 0
 .../ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml | 0
 .../ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml | 0
 .../ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml | 0
 .../ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml | 0
 .../ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml | 0
 .../stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml | 0
 .../ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml | 0
 .../ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml | 0
 .../mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml | 0
 .../ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml | 0
 .../mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml | 0
 .../mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml | 0
 .../ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml | 0
 .../ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml | 0
 .../ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml | 0
 .../stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml | 0
 .../stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml | 0
 .../ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml | 0
 .../ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml | 0
 .../ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml | 0
 .../ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml | 0
 ...x1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml | 0
 ...x1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml | 0
 ...x1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml | 0
 ...tx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml | 0
 .../ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml | 0
 .../ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml | 0
 .../ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml | 0
 .../ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml | 0
 ...x1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml | 0
 ...tx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml | 0
 ...ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml | 0
 .../ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml | 0
 ...tx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml | 0
 ...x2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml | 0
 .../ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml | 0
 .../ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml | 0
 .../ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml | 0
 .../ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml | 0
 ...ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml | 0
 .../ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml | 0
 ...x4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml | 0
 ...tx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml | 0
 ...tx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml | 0
 .../ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml | 0
 ...ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml | 0
 .../ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml | 0
 .../ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml | 0
 .../ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml | 0
 ...tx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml | 0
 .../ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml | 0
 .../ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml | 0
 .../mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml | 0
 .../ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml | 0
 .../ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml | 0
 .../ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml | 0
 .../ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml | 0
 .../ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml | 0
 .../stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml | 0
 .../stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml | 0
 .../ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml | 0
 .../ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml | 0
 .../ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml | 0
 .../ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml | 0
 .../ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml | 0
 .../mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml | 0
 .../mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml | 0
 .../mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml | 0
 .../ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml | 0
 .../ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml | 0
 .../ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml | 0
 .../ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml | 0
 .../stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml | 0
 .../stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml | 0
 .../ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml | 0
 .../ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml | 0
 .../ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml | 0
 ...tx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml | 0
 ...x1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml | 0
 ...ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml | 0
 .../ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml | 0
 .../ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml | 0
 ...x3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml | 0
 ...x1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml | 0
 ...tx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml | 0
 ...ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml | 0
 .../ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml | 0
 .../ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml | 0
 .../ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml | 0
 ...2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml | 0
 .../ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml | 0
 .../ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml | 0
 .../ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml | 0
 .../ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml | 0
 .../ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml | 0
 .../ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml | 0
 .../ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml | 0
 .../ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml | 0
 .../ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml | 0
 .../ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml | 0
 .../ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml | 0
 ...tx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml | 0
 ...x7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml | 0
 .../ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml | 0
 .../mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml | 0
 ...ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml | 0
 .../ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml | 0
 .../ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml | 0
 .../ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml | 0
 .../ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml | 0
 .../ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml | 0
 .../stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml | 0
 .../ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml | 0
 ...ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml | 0
 .../ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml | 0
 ...tx11_gen1_dep16_batch256_eplb256_mtp1.yaml | 0
 .../mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml | 0
 .../ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml | 0
 .../ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml | 0
 .../ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml | 0
 ...tx10_gen1_dep16_batch256_eplb256_mtp0.yaml | 0
 .../ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml | 0
 .../stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml | 0
 .../ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml | 0
 .../ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml | 0
 .../ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml | 0
 ...x1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml | 0
 ...tx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml | 0
 ...x1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml | 0
 ...x1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml | 0
 .../ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml | 0
 .../ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml | 0
 .../ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml | 0
 ...1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml | 0
 ...x1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml | 0
 ...ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml | 0
 ...x1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml | 0
 ...x1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml | 0
 .../ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml | 0
 .../ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml | 0
 .../ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml | 0
 .../ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml | 0
 .../ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml | 0
 ...tx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml | 0
 ...ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml | 0
 ...ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml | 0
 ...tx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml | 0
 .../ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml | 0
 .../ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml | 0
 .../ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml | 0
 ...ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml | 0
 ...tx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml | 0
 ...tx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml | 0
 ...x5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml | 0
 .../mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml | 0
 .../ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml | 0
 .../mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml | 0
 .../mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml | 0
 ...ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml | 0
 .../ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml | 0
 .../stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml | 0
 .../ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml | 0
 .../ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml | 0
 .../ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml | 0
 ...ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml | 0
 .../ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml | 0
 .../ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml | 0
 .../ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml | 0
 ...ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml | 0
 .../mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml | 0
 .../mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml | 0
 .../mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml | 0
 .../ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml | 0
 .../ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml | 0
 .../ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml | 0
 .../ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml | 0
 .../ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml | 0
 .../stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml | 0
 .../stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml | 0
 .../stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml | 0
 .../ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml | 0
 .../ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml | 0
 ...tx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml | 0
 ...ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml | 0
 .../ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml | 0
 .../ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml | 0
 ...2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml | 0
 ...tx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml | 0
 ...x3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml | 0
 .../ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml | 0
 .../ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml | 0
 .../ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml | 0
 ...2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml | 0
 ...x2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml | 0
 ...x3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml | 0
 ...3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml | 0
 ...10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml | 0
 .../ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml | 0
 .../ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml | 0
 ...ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml | 0
 ...x7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml | 0
 ...tx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml | 0
 .../ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml | 0
 .../ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml | 0
 .../ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml | 0
 ...tx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml | 0
 ...tx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml | 0
 ...x7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml | 0
 ...x7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml | 0
 .../ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml | 0
 .../ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml | 0
 .../ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml | 0
 .../ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml | 0
 .../ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml | 0
 .../ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml | 0
 .../ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml | 0
 .../ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml | 0
 .../ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml | 0
 .../ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml | 0
 .../ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml | 0
 .../ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml | 0
 .../ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml | 0
 .../ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml | 0
 .../ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml | 0
 .../ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml | 0
 .../ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml | 0
 .../ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml | 0
 .../ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml | 0
 .../ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml | 0
 .../ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml | 0
 .../ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml | 0
 .../ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml | 0
 .../ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml | 0
 .../ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml | 0
 .../ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml | 0
 .../ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml | 0
 .../ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml | 0
 .../ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml | 0
 ...28_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml | 0
 ...16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml | 0
 .../c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml | 0
 ...56_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml | 0
 ...2_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml | 0
 ...4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml | 0
 ...12_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml | 0
 ...64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml | 0
 ...8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml | 0
 ...28_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml | 0
 ...16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml | 0
 .../c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml | 0
 ...56_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml | 0
 ...32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml | 0
 ...c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml | 0
 ...12_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml | 0
 ...64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml | 0
 ...c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml | 0
 ...128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml | 0
 ...c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml | 0
 .../c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml | 0
 ...256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml | 0
 ...c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml | 0
 .../c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml | 0
 ...512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml | 0
 ...c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml | 0
 .../c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml | 0
 ...28_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml | 0
 ...c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml | 0
 .../c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml | 0
 ...56_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml | 0
 ...32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml | 0
 .../c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml | 0
 ...12_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml | 0
 ...64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml | 0
 .../c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml | 0
 .../stp/disagg-gb200-1p1d-dep8-dep16.yaml | 0
 .../stp/disagg-gb200-1p1d-dep8-tep8.yaml | 0
 .../stp/disagg-gb200-3p1d-dep8-dep16.yaml | 0
 .../stp/disagg-gb200-1p1d-dep8-tep8.yaml | 0
 .../stp/disagg-gb200-3p1d-dep8-dep16.yaml | 0
 .../stp/disagg-gb200-7p1d-dep8-dep16.yaml | 0
 ...ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml | 0
 ...ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml | 0
 ..._gen1dep8_batch768_allconc_eplb0_mtp0.yaml | 0
 ..._gen4tep8_batch128_allconc_eplb0_mtp0.yaml | 0
 ...p4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml | 0
 ...tx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml | 0
 ...tx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml | 0
 ...4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml | 0
 ...p4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml | 0
 ...4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml | 0
 ...ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml | 0
 ...ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml | 0
 ..._gen1dep8_batch256_allconc_eplb0_mtp0.yaml | 0
 ...tx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml | 0
 .../stp/disagg-gb200-1p1d-dep4-dep16.yaml | 0
 .../stp/disagg-gb200-1p4d-dep4-tep4.yaml | 0
 .../stp/disagg-gb200-1p4d-dep4-tep4.yaml | 0
 .../stp/disagg-gb200-3p1d-dep4-dep16.yaml | 0
 .../stp/disagg-gb200-5p1d-dep4-dep8.yaml | 0
 .../stp/disagg-gb200-6p1d-dep4-dep16.yaml | 0
 372 files changed, 395 insertions(+), 395 deletions(-)
 rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp4/{1k/1k => 1k1k}/disagg/1k1k.yaml (100%)
 rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp4/{8k/1k => 8k1k}/disagg/8k1k.yaml (100%)
 rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/{1k/1k => 1k1k}/disagg/1k1k.yaml (100%)
 rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/{8k/1k => 8k1k}/disagg/mtp/8k1k_mtp_lowlat_0.yaml (100%)
 rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/{8k/1k => 8k1k}/disagg/mtp/8k1k_mtp_lowlat_1.yaml (100%)
 rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/{8k/1k => 8k1k}/disagg/mtp/8k1k_mtp_lowlat_2.yaml (100%)
 rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/{8k/1k => 8k1k}/disagg/mtp/8k1k_mtp_maxtpt_0.yaml (100%)
 rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/{8k/1k => 8k1k}/disagg/mtp/8k1k_mtp_maxtpt_1.yaml (100%)
 rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/{8k/1k => 8k1k}/disagg/mtp/8k1k_mtp_maxtpt_2.yaml (100%)
 rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/{8k/1k => 8k1k}/disagg/mtp/8k1k_mtp_maxtpt_3.yaml (100%)
8k1k}/disagg/stp/8k1k_stp_lowlat_0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/{8k/1k => 8k1k}/disagg/stp/8k1k_stp_lowlat_1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/{8k/1k => 8k1k}/disagg/stp/8k1k_stp_lowlat_2.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/{8k/1k => 8k1k}/disagg/stp/8k1k_stp_maxtpt_0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/{8k/1k => 8k1k}/disagg/stp/8k1k_stp_maxtpt_1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/{8k/1k => 8k1k}/disagg/stp/8k1k_stp_maxtpt_2.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/{8k/1k => 8k1k}/disagg/stp/8k1k_stp_maxtpt_3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/low-latency.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/max-tpt.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/mid-curve.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/low-latency.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/max-tpt.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/mid-curve.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/{1k/1k => 1k1k}/disagg/stp/low-latency.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/{1k/1k => 1k1k}/disagg/stp/max-tpt.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/{1k/1k => 1k1k}/disagg/stp/mid-curve.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/{1k/1k => 1k1k}/disagg/stp/ultra-tpt.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/{8k/1k => 8k1k}/disagg/stp/low-latency.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/{8k/1k => 8k1k}/disagg/stp/max_tpt.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/{8k/1k => 8k1k}/disagg/stp/mid-curve.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/{1k/1k => 1k1k}/disagg/stp/low_latency.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/{1k/1k => 1k1k}/disagg/stp/max_tpt.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/{1k/1k => 1k1k}/disagg/stp/mid_curve.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/{8k/1k => 8k1k}/disagg/stp/low_latency.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/{8k/1k => 8k1k}/disagg/stp/max_tpt.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/{8k/1k => 8k1k}/disagg/stp/mid_curve.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/{1k/1k => 1k1k}/disagg/stp/low-latency.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/{1k/1k => 1k1k}/disagg/stp/max.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/{1k/1k => 1k1k}/disagg/stp/mid.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/{8k/1k => 8k1k}/disagg/stp/low-latency.yaml (100%) rename 
benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/{8k/1k => 8k1k}/disagg/stp/max.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/{8k/1k => 8k1k}/disagg/stp/mid.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/{1k/1k => 1k1k}/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/{1k/1k => 1k1k}/disagg/mtp/h100-fp8-1p2d-max-tp-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/{1k/1k => 1k1k}/disagg/stp/h100-fp8-1p1d-max-dep.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/{1k/1k => 1k1k}/disagg/stp/h100-fp8-1p2d-max-tp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/{8k/1k => 8k1k}/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/{8k/1k => 8k1k}/disagg/mtp/h100-fp8-1p1d-max-tp-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/{8k/1k => 8k1k}/disagg/stp/h100-fp8-1p1d-max-dep.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/{8k/1k => 8k1k}/disagg/stp/h100-fp8-1p1d-max-tp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/{1k/1k => 1k1k}/disagg/mtp/bs256-1p6d-dep-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/{1k/1k => 1k1k}/disagg/mtp/bs256-1p6d-tp-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/{1k/1k => 1k1k}/disagg/mtp/low-latency-1p9d-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/{1k/1k => 1k1k}/disagg/stp/bs256-1p6d-dep.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/{1k/1k => 1k1k}/disagg/stp/bs256-1p6d-tp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/{1k/1k => 1k1k}/disagg/stp/low-latency-1p9d.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/{8k/1k => 8k1k}/disagg/mtp/bs128-1p1d-dep-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/{8k/1k => 8k1k}/disagg/mtp/bs16-1p3d-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/{8k/1k => 8k1k}/disagg/mtp/bs4-1p7d-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/{8k/1k => 8k1k}/disagg/mtp/bs64-2p3d-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/{8k/1k => 8k1k}/disagg/mtp/bs8-1p6d-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/{8k/1k => 8k1k}/disagg/stp/bs128-1p1d-dep.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/{8k/1k => 8k1k}/disagg/stp/bs16-1p3d.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/{8k/1k => 8k1k}/disagg/stp/bs4-1p7d.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/{8k/1k => 8k1k}/disagg/stp/bs64-2p3d.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/{8k/1k => 8k1k}/disagg/stp/bs8-1p6d.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml (100%) rename 
benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{8k/1k => 8k1k}/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{8k/1k => 8k1k}/disagg/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{8k/1k => 8k1k}/disagg/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{8k/1k => 8k1k}/disagg/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{8k/1k => 8k1k}/disagg/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{8k/1k => 8k1k}/disagg/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml (100%) rename 
benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{1k/1k => 1k1k}/disagg/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{8k/1k => 8k1k}/disagg/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{8k/1k => 8k1k}/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{8k/1k => 8k1k}/disagg/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{8k/1k => 8k1k}/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{8k/1k => 8k1k}/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{8k/1k => 
8k1k}/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{8k/1k => 8k1k}/disagg/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{8k/1k => 8k1k}/disagg/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{1k/1k => 1k1k}/disagg/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{1k/1k => 1k1k}/disagg/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{1k/1k => 1k1k}/disagg/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{8k/1k => 8k1k}/disagg/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{8k/1k => 8k1k}/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{8k/1k => 
8k1k}/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{8k/1k => 8k1k}/disagg/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{8k/1k => 8k1k}/disagg/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{8k/1k => 8k1k}/disagg/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{8k/1k => 8k1k}/disagg/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{1k/1k => 1k1k}/disagg/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{8k/1k => 8k1k}/disagg/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml (100%) rename 
benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{8k/1k => 8k1k}/disagg/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{8k/1k => 8k1k}/disagg/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{8k/1k => 8k1k}/disagg/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{8k/1k => 8k1k}/disagg/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{8k/1k => 8k1k}/disagg/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{8k/1k => 8k1k}/disagg/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{8k/1k => 
8k1k}/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml (100%) 
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{8k/1k => 8k1k}/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{8k/1k => 8k1k}/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{8k/1k => 8k1k}/disagg/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{8k/1k => 8k1k}/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{8k/1k => 8k1k}/disagg/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{8k/1k => 8k1k}/disagg/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{8k/1k => 8k1k}/disagg/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{1k/1k => 1k1k}/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{1k/1k => 1k1k}/disagg/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{1k/1k => 1k1k}/disagg/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{1k/1k => 1k1k}/disagg/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{8k/1k => 
8k1k}/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{8k/1k => 8k1k}/disagg/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{8k/1k => 8k1k}/disagg/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{8k/1k => 8k1k}/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{8k/1k => 8k1k}/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{8k/1k => 8k1k}/disagg/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{8k/1k => 8k1k}/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{8k/1k => 8k1k}/disagg/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{8k/1k => 8k1k}/disagg/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{1k/1k => 1k1k}/disagg/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{1k/1k => 1k1k}/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{1k/1k => 1k1k}/disagg/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml (100%) rename 
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{1k/1k => 1k1k}/disagg/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{8k/1k => 8k1k}/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{8k/1k => 8k1k}/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{8k/1k => 8k1k}/disagg/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{8k/1k => 8k1k}/disagg/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{8k/1k => 8k1k}/disagg/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{8k/1k => 8k1k}/disagg/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{8k/1k => 8k1k}/disagg/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{1k/1k => 1k1k}/disagg/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{8k/1k => 8k1k}/disagg/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{8k/1k => 8k1k}/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{8k/1k => 8k1k}/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{8k/1k => 8k1k}/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{8k/1k => 8k1k}/disagg/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{1k/1k => 1k1k}/disagg/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{1k/1k => 1k1k}/disagg/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{1k/1k => 1k1k}/disagg/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{1k/1k => 1k1k}/disagg/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{1k/1k => 1k1k}/disagg/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{1k/1k => 1k1k}/disagg/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{1k/1k => 1k1k}/disagg/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{1k/1k => 1k1k}/disagg/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{1k/1k => 1k1k}/disagg/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{1k/1k => 1k1k}/disagg/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{1k/1k => 1k1k}/disagg/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{1k/1k => 1k1k}/disagg/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{1k/1k => 1k1k}/disagg/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{1k/1k => 1k1k}/disagg/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{1k/1k => 1k1k}/disagg/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{1k/1k => 1k1k}/disagg/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{1k/1k => 1k1k}/disagg/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{1k/1k => 1k1k}/disagg/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{8k/1k => 8k1k}/disagg/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{8k/1k => 8k1k}/disagg/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{8k/1k => 8k1k}/disagg/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{8k/1k => 8k1k}/disagg/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{8k/1k => 8k1k}/disagg/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{8k/1k => 8k1k}/disagg/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{8k/1k => 8k1k}/disagg/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{8k/1k => 8k1k}/disagg/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{8k/1k => 8k1k}/disagg/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{8k/1k => 8k1k}/disagg/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{8k/1k => 8k1k}/disagg/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{8k/1k => 8k1k}/disagg/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{8k/1k => 8k1k}/disagg/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{8k/1k => 8k1k}/disagg/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{8k/1k => 8k1k}/disagg/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{8k/1k => 8k1k}/disagg/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{8k/1k => 8k1k}/disagg/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{8k/1k => 8k1k}/disagg/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/disagg-gb200-1p1d-dep8-dep16.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/disagg-gb200-7p1d-dep8-dep16.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/disagg-gb200-1p1d-dep4-dep16.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/disagg-gb200-3p1d-dep4-dep16.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/disagg-gb200-5p1d-dep4-dep8.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/disagg-gb200-6p1d-dep4-dep16.yaml (100%)

diff --git a/.github/configs/CONFIGS.md b/.github/configs/CONFIGS.md
index 482c9acfc..46755ef31 100644
--- a/.github/configs/CONFIGS.md
+++ b/.github/configs/CONFIGS.md
@@ -69,9 +69,9 @@ search-space:
 ```
 - `recipe` is a path **relative to `benchmarks/multi_node/srt-slurm-recipes/`** in this repo. The schema validator rejects entries whose recipe file does not exist on disk, so adding a new entry requires upstreaming the recipe yaml here first.
-- The path may carry an `:override[N]` / `:override_<name>[N]` suffix to select a named override section inside an sglang-style recipe yaml (e.g. `"dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml:zip_override_mtp_lowlat[0]"`). The launcher strips this suffix before reading the file but passes the full string to `srtctl`.
+- The path may carry an `:override[N]` / `:override_<name>[N]` suffix to select a named override section inside an sglang-style recipe yaml (e.g. `"dsr1/sglang/b200-fp4/1k1k/disagg/1k1k.yaml:zip_override_mtp_lowlat[0]"`). The launcher strips this suffix before reading the file but passes the full string to `srtctl`.
 - `recipe` is optional: multi-node entries that do *not* go through srt-slurm (e.g. dynamo-sglang aggregated topologies that drive their own bash) leave it unset.
-- Recipes live under `benchmarks/multi_node/srt-slurm-recipes/` organized as `<model>/<framework>/<gpu>-<precision>/<isl>/<osl>/<serving>/<spec>/<name>.yaml` — e.g. `dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml`. A handful of sglang-style files that carry override sections spanning both stp and mtp are parked one level shallower (the trailing `/<spec>` segment is omitted). The benchmark template resolves `recipe` to an absolute path and passes it to the launcher as `CONFIG_FILE`, so launchers do not see the relative form.
+- Recipes live under `benchmarks/multi_node/srt-slurm-recipes/` organized as `<model>/<framework>/<gpu>-<precision>/<isl><osl>/<serving>/<spec>/<name>.yaml` — e.g. `dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml`. A handful of sglang-style files that carry override sections spanning both stp and mtp are parked one level shallower (the trailing `/<spec>` segment is omitted). The benchmark template resolves `recipe` to an absolute path and passes it to the launcher as `CONFIG_FILE`, so launchers do not see the relative form.
 
 ## Runners
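Concretely, a master-yaml search-space entry now carries the recipe directly. A minimal sketch of the shape (values mirror the first dsr1-fp4-b200-dynamo-trt hunk below; abbreviated, not a complete entry):

```yaml
search-space:
  - spec-decoding: "mtp"
    conc-list: [1214]
    # Path is relative to benchmarks/multi_node/srt-slurm-recipes/ and is
    # validated against on-disk files at sweep generation time.
    recipe: "dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml"
    prefill:
      num-worker: 1
      tp: 4
```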
diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index bb59f1dd0..9ff2a96aa 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -13,7 +13,7 @@ dsr1-fp4-b200-dynamo-trt:
   search-space:
     - spec-decoding: "mtp"
       conc-list: [1214]
-      recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml"
       prefill:
         num-worker: 1
         tp: 4
@@ -26,7 +26,7 @@ dsr1-fp4-b200-dynamo-trt:
         dp-attn: true
     - spec-decoding: "mtp"
      conc-list: [875]
-      recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml"
       prefill:
         num-worker: 1
         tp: 4
@@ -39,7 +39,7 @@ dsr1-fp4-b200-dynamo-trt:
         dp-attn: true
     - spec-decoding: "mtp"
       conc-list: [6]
-      recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml"
       prefill:
         num-worker: 1
         tp: 4
@@ -52,7 +52,7 @@ dsr1-fp4-b200-dynamo-trt:
         dp-attn: false
     - spec-decoding: "mtp"
       conc-list: [10, 15, 25, 45, 90, 180]
-      recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml"
       prefill:
         num-worker: 1
         tp: 4
@@ -65,7 +65,7 @@ dsr1-fp4-b200-dynamo-trt:
         dp-attn: false
     - spec-decoding: "mtp"
       conc-list: [ 4968 ]
-      recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml"
       prefill:
         num-worker: 3
         tp: 4
@@ -78,7 +78,7 @@ dsr1-fp4-b200-dynamo-trt:
         dp-attn: true
     - spec-decoding: "mtp"
       conc-list: [10860]
-      recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml"
       prefill:
         num-worker: 3
         tp: 4
@@ -92,7 +92,7 @@ dsr1-fp4-b200-dynamo-trt:
         dp-attn: true
     # Non-MTP configurations
     - conc-list: [4096]
-      recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml"
       prefill:
         num-worker: 1
         tp: 4
@@ -104,7 +104,7 @@ dsr1-fp4-b200-dynamo-trt:
         ep: 8
         dp-attn: true
     - conc-list: [2192]
-      recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml"
       prefill:
         num-worker: 1
         tp: 4
@@ -116,7 +116,7 @@ dsr1-fp4-b200-dynamo-trt:
         ep: 8
         dp-attn: true
     - conc-list: [1365]
-      recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml"
       prefill:
         num-worker: 1
         tp: 4
@@ -128,7 +128,7 @@ dsr1-fp4-b200-dynamo-trt:
         ep: 8
         dp-attn: true
     - conc-list: [6]
-      recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml"
       prefill:
         num-worker: 1
         tp: 4
@@ -140,7 +140,7 @@ dsr1-fp4-b200-dynamo-trt:
         ep: 8
         dp-attn: false
     - conc-list: [10, 15, 25, 45, 90, 180]
-      recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml"
       prefill:
         num-worker: 1
         tp: 4
@@ -152,7 +152,7 @@ dsr1-fp4-b200-dynamo-trt:
         ep: 8
         dp-attn: false
     - conc-list: [450]
-      recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml"
       prefill:
         num-worker: 1
         tp: 4
@@ -169,7 +169,7 @@ dsr1-fp4-b200-dynamo-trt:
   search-space:
     - spec-decoding: "mtp"
       conc-list: [90]
-      recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml"
       prefill:
         num-worker: 1
         tp: 4
@@ -182,7 +182,7 @@ dsr1-fp4-b200-dynamo-trt:
         dp-attn: true
     - spec-decoding: "mtp"
       conc-list: [66]
-      recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml"
       prefill:
         num-worker: 1
         tp: 4
@@ -195,7 +195,7 @@ dsr1-fp4-b200-dynamo-trt:
         dp-attn: false
     - spec-decoding: "mtp"
       conc-list: [6]
-      recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml"
       prefill:
         num-worker: 1
         tp: 4
@@ -208,7 +208,7 @@ dsr1-fp4-b200-dynamo-trt:
         dp-attn: false
     - spec-decoding: "mtp"
       conc-list: [10, 15, 30, 60]
-      recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml"
       prefill:
         num-worker: 1
         tp: 4
@@ -221,7 +221,7 @@ dsr1-fp4-b200-dynamo-trt:
         dp-attn: false
     - spec-decoding: "mtp"
       conc-list: [548]
-      recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml"
       prefill:
         num-worker: 3
         tp: 4
@@ -234,7 +234,7 @@ dsr1-fp4-b200-dynamo-trt:
         dp-attn: true
     - spec-decoding: "mtp"
       conc-list: [1096, 1691]
-      recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml"
       prefill:
         num-worker: 5
         tp: 4
@@ -247,7 +247,7 @@ dsr1-fp4-b200-dynamo-trt:
         dp-attn: true
     - spec-decoding: "mtp"
       conc-list: [658]
-      recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml"
       prefill:
         num-worker: 5
         tp: 4
@@ -261,7 +261,7 @@ dsr1-fp4-b200-dynamo-trt:
 
     # Non-MTP configurations
     - conc-list: [6]
-      recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml"
       prefill:
         num-worker: 1
         tp: 4
@@ -273,7 +273,7 @@ dsr1-fp4-b200-dynamo-trt:
         ep: 8
         dp-attn: false
     - conc-list: [10, 15, 25, 50, 100]
-      recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml"
       prefill:
         num-worker: 1
         tp: 4
@@ -285,7 +285,7 @@ dsr1-fp4-b200-dynamo-trt:
         ep: 8
         dp-attn: false
     - conc-list: [370]
-      recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml"
       prefill:
         num-worker: 2
         tp: 4
@@ -297,7 +297,7 @@ dsr1-fp4-b200-dynamo-trt:
         ep: 8
         dp-attn: false
     - conc-list: [1606]
-      recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml"
"dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml" prefill: num-worker: 4 tp: 4 @@ -309,7 +309,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [837] - recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml" prefill: num-worker: 4 tp: 4 @@ -321,7 +321,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [2222] - recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml" prefill: num-worker: 7 tp: 4 @@ -349,7 +349,7 @@ dsr1-fp8-b200-dynamo-trt: # MTP configurations - Low latency (TP attention) - spec-decoding: "mtp" conc-list: [8] - recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml" prefill: num-worker: 1 tp: 8 @@ -362,7 +362,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [32] - recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml" prefill: num-worker: 1 tp: 8 @@ -375,7 +375,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [64] - recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml" prefill: num-worker: 1 tp: 8 @@ -388,7 +388,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [256] - recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml" prefill: num-worker: 1 tp: 8 @@ -402,7 +402,7 @@ dsr1-fp8-b200-dynamo-trt: # MTP configurations - High throughput (DP attention) - spec-decoding: "mtp" conc-list: [896] - recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml" prefill: num-worker: 1 tp: 8 @@ -415,7 +415,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1024] - recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml" prefill: num-worker: 1 tp: 8 @@ -428,7 +428,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1184] - recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml" prefill: num-worker: 1 tp: 8 @@ -441,7 +441,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1600] - recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml" prefill: num-worker: 1 tp: 8 @@ -455,7 +455,7 @@ dsr1-fp8-b200-dynamo-trt: # Non-MTP (STP) configurations - Low latency (TP attention) - conc-list: [4] - recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml" + recipe: 
"dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml" prefill: num-worker: 1 tp: 8 @@ -467,7 +467,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: false - conc-list: [32] - recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml" prefill: num-worker: 1 tp: 8 @@ -479,7 +479,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: false - conc-list: [128] - recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml" prefill: num-worker: 1 tp: 8 @@ -492,7 +492,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: false # Non-MTP (STP) configurations - High throughput (DP attention) - conc-list: [1920] - recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml" prefill: num-worker: 1 tp: 8 @@ -504,7 +504,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [4096] - recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml" prefill: num-worker: 1 tp: 8 @@ -516,7 +516,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [5152] - recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml" prefill: num-worker: 2 tp: 8 @@ -534,7 +534,7 @@ dsr1-fp8-b200-dynamo-trt: # MTP configurations - Low latency (TP attention) - spec-decoding: "mtp" conc-list: [8] - recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml" prefill: num-worker: 1 tp: 8 @@ -547,7 +547,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [8] - recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml" prefill: num-worker: 1 tp: 8 @@ -560,7 +560,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [48] - recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml" prefill: num-worker: 1 tp: 8 @@ -573,7 +573,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [64] - recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml" prefill: num-worker: 1 tp: 8 @@ -587,7 +587,7 @@ dsr1-fp8-b200-dynamo-trt: # MTP configurations - High throughput (DP attention) - spec-decoding: "mtp" conc-list: [224] - recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml" prefill: num-worker: 2 tp: 8 @@ -600,7 +600,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [288] - recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml" + recipe: 
"dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml" prefill: num-worker: 2 tp: 8 @@ -613,7 +613,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1088] - recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml" prefill: num-worker: 4 tp: 8 @@ -627,7 +627,7 @@ dsr1-fp8-b200-dynamo-trt: # Non-MTP (STP) configurations - Low latency (TP attention) - conc-list: [1] - recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml" prefill: num-worker: 1 tp: 8 @@ -639,7 +639,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: false - conc-list: [32] - recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml" prefill: num-worker: 1 tp: 8 @@ -651,7 +651,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: false - conc-list: [128] - recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml" prefill: num-worker: 1 tp: 8 @@ -663,7 +663,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: false - conc-list: [96] - recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml" prefill: num-worker: 1 tp: 8 @@ -676,7 +676,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: false # Non-MTP (STP) configurations - High throughput (DP attention) - conc-list: [128] - recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml" prefill: num-worker: 1 tp: 8 @@ -688,7 +688,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [128] - recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml" prefill: num-worker: 1 tp: 8 @@ -700,7 +700,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [256] - recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml" prefill: num-worker: 1 tp: 8 @@ -712,7 +712,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [640] - recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml" prefill: num-worker: 2 tp: 8 @@ -740,7 +740,7 @@ dsr1-fp4-b300-dynamo-trt: search-space: - spec-decoding: "mtp" conc-list: [654] - recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -753,7 +753,7 @@ dsr1-fp4-b300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [271] - recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -766,7 +766,7 @@ 
@@ -766,7 +766,7 @@ dsr1-fp4-b300-dynamo-trt:
         dp-attn: true
     - spec-decoding: "mtp"
       conc-list: [11]
-      recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml"
       prefill:
         num-worker: 1
         tp: 2
@@ -779,7 +779,7 @@ dsr1-fp4-b300-dynamo-trt:
         dp-attn: false
     - spec-decoding: "mtp"
       conc-list: [10, 20, 25, 60, 120, 200]
-      recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml"
       prefill:
         num-worker: 1
         tp: 2
@@ -792,7 +792,7 @@ dsr1-fp4-b300-dynamo-trt:
         dp-attn: false
     - spec-decoding: "mtp"
       conc-list: [2342]
-      recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml"
+      recipe: "dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml"
       prefill:
         num-worker: 2
         tp: 2
@@ -805,7 +805,7 @@ dsr1-fp4-b300-dynamo-trt:
         dp-attn: true
     - spec-decoding: "mtp"
       conc-list: [8609]
-      recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml"
+      recipe: "dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml"
       prefill:
         num-worker: 5
         tp: 2
@@ -818,7 +818,7 @@ dsr1-fp4-b300-dynamo-trt:
         dp-attn: true
     - spec-decoding: "mtp"
       conc-list: [12926]
-      recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml"
+      recipe: "dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml"
       prefill:
         num-worker: 5
         tp: 2
@@ -832,7 +832,7 @@ dsr1-fp4-b300-dynamo-trt:
 
     # Non-MTP configurations
     - conc-list: [1176]
-      recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml"
+      recipe: "dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml"
       prefill:
         num-worker: 1
         tp: 2
@@ -844,7 +844,7 @@ dsr1-fp4-b300-dynamo-trt:
         ep: 8
         dp-attn: true
     - conc-list: [6]
-      recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml"
+      recipe: "dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml"
       prefill:
         num-worker: 1
         tp: 2
@@ -856,7 +856,7 @@ dsr1-fp4-b300-dynamo-trt:
         ep: 8
         dp-attn: false
     - conc-list: [5, 10, 15, 25]
-      recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml"
+      recipe: "dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml"
       prefill:
         num-worker: 1
         tp: 2
@@ -868,7 +868,7 @@ dsr1-fp4-b300-dynamo-trt:
         ep: 4
         dp-attn: false
     - conc-list: [60, 110, 195, 395]
-      recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml"
+      recipe: "dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml"
       prefill:
         num-worker: 1
         tp: 2
@@ -880,7 +880,7 @@ dsr1-fp4-b300-dynamo-trt:
         ep: 8
         dp-attn: false
     - conc-list: [4405]
-      recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml"
+      recipe: "dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml"
       prefill:
         num-worker: 2
         tp: 2
@@ -892,7 +892,7 @@ dsr1-fp4-b300-dynamo-trt:
         ep: 8
         dp-attn: true
     - conc-list: [8192]
-      recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml"
+      recipe: "dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml"
       prefill:
         num-worker: 3
         tp: 2
@@ -904,7 +904,7 @@ dsr1-fp4-b300-dynamo-trt:
         ep: 8
         dp-attn: true
     - conc-list: [4611]
-      recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml"
"dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml" prefill: num-worker: 3 tp: 2 @@ -921,7 +921,7 @@ dsr1-fp4-b300-dynamo-trt: search-space: - spec-decoding: "mtp" conc-list: [2198] - recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml" prefill: num-worker: 10 tp: 2 @@ -934,7 +934,7 @@ dsr1-fp4-b300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [52] - recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -947,7 +947,7 @@ dsr1-fp4-b300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [8] - recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -960,7 +960,7 @@ dsr1-fp4-b300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [32] - recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -973,7 +973,7 @@ dsr1-fp4-b300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [181] - recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml" prefill: num-worker: 3 tp: 2 @@ -986,7 +986,7 @@ dsr1-fp4-b300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1197] - recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml" prefill: num-worker: 9 tp: 2 @@ -1000,7 +1000,7 @@ dsr1-fp4-b300-dynamo-trt: # Non-MTP configurations - conc-list: [105] - recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -1012,7 +1012,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 4 dp-attn: false - conc-list: [63] - recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -1024,7 +1024,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 8 dp-attn: false - conc-list: [4] - recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -1036,7 +1036,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 8 dp-attn: false - conc-list: [12] - recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -1048,7 +1048,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 4 dp-attn: false - conc-list: [589] - recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml" prefill: num-worker: 5 tp: 2 @@ -1060,7 +1060,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 8 dp-attn: true - conc-list: [1093] - recipe: 
"dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml" prefill: num-worker: 6 tp: 2 @@ -1072,7 +1072,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 8 dp-attn: true - conc-list: [2048] - recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml" prefill: num-worker: 8 tp: 2 @@ -1100,7 +1100,7 @@ dsr1-fp8-b300-dynamo-trt: search-space: - spec-decoding: "mtp" conc-list: [10] - recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml" prefill: num-worker: 1 tp: 4 @@ -1113,7 +1113,7 @@ dsr1-fp8-b300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [160] - recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml" prefill: num-worker: 1 tp: 4 @@ -1126,7 +1126,7 @@ dsr1-fp8-b300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [3072] - recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml" prefill: num-worker: 1 tp: 4 @@ -1139,7 +1139,7 @@ dsr1-fp8-b300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [2560] - recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml" prefill: num-worker: 1 tp: 4 @@ -1152,7 +1152,7 @@ dsr1-fp8-b300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [720] - recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml" prefill: num-worker: 1 tp: 4 @@ -1165,7 +1165,7 @@ dsr1-fp8-b300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [11264] - recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml" prefill: num-worker: 3 tp: 4 @@ -1181,7 +1181,7 @@ dsr1-fp8-b300-dynamo-trt: osl: 1024 search-space: - conc-list: [2112] - recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml" prefill: num-worker: 1 tp: 4 @@ -1193,7 +1193,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 8 dp-attn: true - conc-list: [3072] - recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml" prefill: num-worker: 1 tp: 4 @@ -1205,7 +1205,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: true - conc-list: [1280] - recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml" prefill: num-worker: 1 tp: 4 @@ -1217,7 +1217,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: true - conc-list: [12] - recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml" + recipe: 
"dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml" prefill: num-worker: 1 tp: 4 @@ -1229,7 +1229,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: false - conc-list: [128] - recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml" prefill: num-worker: 1 tp: 4 @@ -1241,7 +1241,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: false - conc-list: [384] - recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml" prefill: num-worker: 1 tp: 4 @@ -1253,7 +1253,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: false - conc-list: [16384] - recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml" prefill: num-worker: 2 tp: 4 @@ -1270,7 +1270,7 @@ dsr1-fp8-b300-dynamo-trt: search-space: - spec-decoding: "mtp" conc-list: [40] - recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml" + recipe: "dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml" prefill: num-worker: 1 tp: 4 @@ -1283,7 +1283,7 @@ dsr1-fp8-b300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [8] - recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml" + recipe: "dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml" prefill: num-worker: 1 tp: 4 @@ -1296,7 +1296,7 @@ dsr1-fp8-b300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [20] - recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml" + recipe: "dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml" prefill: num-worker: 1 tp: 4 @@ -1309,7 +1309,7 @@ dsr1-fp8-b300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [72] - recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml" + recipe: "dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml" prefill: num-worker: 1 tp: 4 @@ -1322,7 +1322,7 @@ dsr1-fp8-b300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [144] - recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml" + recipe: "dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml" prefill: num-worker: 2 tp: 4 @@ -1335,7 +1335,7 @@ dsr1-fp8-b300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [512] - recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml" + recipe: "dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml" prefill: num-worker: 4 tp: 4 @@ -1351,7 +1351,7 @@ dsr1-fp8-b300-dynamo-trt: osl: 1024 search-space: - conc-list: [64] - recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml" + recipe: "dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml" prefill: num-worker: 1 tp: 4 @@ -1363,7 +1363,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: false - conc-list: [16] - recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml" + recipe: "dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml" prefill: num-worker: 1 tp: 4 @@ -1375,7 +1375,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: false - 
    - conc-list: [256]
-      recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml"
+      recipe: "dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml"
       prefill:
         num-worker: 2
         tp: 4
@@ -1387,7 +1387,7 @@ dsr1-fp8-b300-dynamo-trt:
         ep: 1
         dp-attn: true
     - conc-list: [512]
-      recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml"
+      recipe: "dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml"
       prefill:
         num-worker: 3
         tp: 4
@@ -1399,7 +1399,7 @@ dsr1-fp8-b300-dynamo-trt:
         ep: 1
         dp-attn: true
     - conc-list: [256]
-      recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml"
+      recipe: "dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml"
       prefill:
         num-worker: 3
         tp: 4
@@ -1411,7 +1411,7 @@ dsr1-fp8-b300-dynamo-trt:
         ep: 1
         dp-attn: false
     - conc-list: [1075]
-      recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml"
+      recipe: "dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml"
       prefill:
         num-worker: 5
         tp: 4
@@ -1423,7 +1423,7 @@ dsr1-fp8-b300-dynamo-trt:
         ep: 1
         dp-attn: true
     - conc-list: [3072]
-      recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml"
+      recipe: "dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml"
       prefill:
         num-worker: 7
         tp: 4
@@ -2440,7 +2440,7 @@ dsr1-fp8-h200-dynamo-trt:
     # MTP configurations
     - spec-decoding: "mtp"
       conc-list: [1]
-      recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml"
       prefill:
         num-worker: 1
         tp: 8
@@ -2453,7 +2453,7 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: false
     - spec-decoding: "mtp"
       conc-list: [4]
-      recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml"
       prefill:
         num-worker: 1
         tp: 8
@@ -2466,7 +2466,7 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: false
     - spec-decoding: "mtp"
       conc-list: [8]
-      recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml"
       prefill:
         num-worker: 1
         tp: 8
@@ -2479,7 +2479,7 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: false
     - spec-decoding: "mtp"
       conc-list: [16]
-      recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml"
       prefill:
         num-worker: 1
         tp: 8
@@ -2492,7 +2492,7 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: false
     - spec-decoding: "mtp"
       conc-list: [32]
-      recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml"
       prefill:
         num-worker: 1
         tp: 8
@@ -2505,7 +2505,7 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: false
     - spec-decoding: "mtp"
       conc-list: [64]
-      recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml"
       prefill:
         num-worker: 1
         tp: 8
@@ -2518,7 +2518,7 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: true
     - spec-decoding: "mtp"
       conc-list: [128]
-      recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml"
"dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 8 @@ -2531,7 +2531,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [256] - recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 8 @@ -2544,7 +2544,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [512] - recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml" prefill: num-worker: 1 tp: 8 @@ -2557,7 +2557,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: true # Non-MTP configurations (STP) - conc-list: [1] - recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2569,7 +2569,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [4] - recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2581,7 +2581,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [8] - recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2593,7 +2593,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [16] - recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2605,7 +2605,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [32] - recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2617,7 +2617,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [64] - recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2629,7 +2629,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [128] - recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2641,7 +2641,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [256] - recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2653,7 +2653,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [512] - recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 8 @@ -2670,7 +2670,7 @@ dsr1-fp8-h200-dynamo-trt: # MTP 
@@ -2670,7 +2670,7 @@ dsr1-fp8-h200-dynamo-trt:
     # MTP configurations
     - spec-decoding: "mtp"
       conc-list: [1]
-      recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml"
       prefill:
         num-worker: 1
         tp: 8
@@ -2683,7 +2683,7 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: false
     - spec-decoding: "mtp"
       conc-list: [4]
-      recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml"
       prefill:
         num-worker: 1
         tp: 8
@@ -2696,7 +2696,7 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: false
     - spec-decoding: "mtp"
       conc-list: [8]
-      recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml"
       prefill:
         num-worker: 1
         tp: 8
@@ -2709,7 +2709,7 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: false
     - spec-decoding: "mtp"
       conc-list: [16]
-      recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml"
+      recipe: "dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml"
       prefill:
         num-worker: 1
         tp: 8
@@ -2722,7 +2722,7 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: false
     - spec-decoding: "mtp"
       conc-list: [32]
-      recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml"
       prefill:
         num-worker: 3
         tp: 8
@@ -2735,7 +2735,7 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: false
     - spec-decoding: "mtp"
       conc-list: [64]
-      recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml"
+      recipe: "dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml"
       prefill:
         num-worker: 1
         tp: 8
@@ -2748,7 +2748,7 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: true
     - spec-decoding: "mtp"
       conc-list: [128]
-      recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml"
+      recipe: "dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml"
       prefill:
         num-worker: 2
         tp: 8
@@ -2761,7 +2761,7 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: true
     - spec-decoding: "mtp"
       conc-list: [256]
-      recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml"
+      recipe: "dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml"
       prefill:
         num-worker: 3
         tp: 8
@@ -2774,7 +2774,7 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: true
     - spec-decoding: "mtp"
       conc-list: [512]
-      recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml"
+      recipe: "dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml"
       prefill:
         num-worker: 3
         tp: 8
@@ -2787,7 +2787,7 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: true
     # Non-MTP configurations (STP)
     - conc-list: [1]
-      recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml"
+      recipe: "dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml"
       prefill:
         num-worker: 1
         tp: 8
@@ -2799,7 +2799,7 @@ dsr1-fp8-h200-dynamo-trt:
         ep: 8
         dp-attn: false
     - conc-list: [4]
-      recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml"
+      recipe: "dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml"
       prefill:
         num-worker: 1
         tp: 8
@@ -2811,7 +2811,7 @@ dsr1-fp8-h200-dynamo-trt:
         ep: 8
         dp-attn: false
     - conc-list: [8]
"dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2823,7 +2823,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [16] - recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2835,7 +2835,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [32] - recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 8 @@ -2847,7 +2847,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [64] - recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 8 @@ -2859,7 +2859,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [128] - recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2871,7 +2871,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [256] - recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml" prefill: num-worker: 5 tp: 8 @@ -2883,7 +2883,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [512] - recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml" prefill: num-worker: 3 tp: 8 @@ -2911,7 +2911,7 @@ dsr1-fp8-h100-dynamo-trt: # MTP configurations - spec-decoding: "mtp" conc-list: [6] - recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -2924,7 +2924,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [9] - recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -2937,7 +2937,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [30] - recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -2950,7 +2950,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [60] - recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -2963,7 +2963,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [117] - recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml" 
prefill: num-worker: 1 tp: 16 @@ -2976,7 +2976,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [231] - recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -2989,7 +2989,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [462] - recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -3002,7 +3002,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [615] - recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml" prefill: num-worker: 1 tp: 16 @@ -3015,7 +3015,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1229] - recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml" prefill: num-worker: 1 tp: 16 @@ -3028,7 +3028,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: true # Non-MTP configurations (STP) - conc-list: [6] - recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3040,7 +3040,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: false - conc-list: [9] - recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3052,7 +3052,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: false - conc-list: [30] - recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3064,7 +3064,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: false - conc-list: [60] - recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3076,7 +3076,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: false - conc-list: [231] - recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3088,7 +3088,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true - conc-list: [462] - recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3100,7 +3100,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true - conc-list: [924] - recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3112,7 +3112,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true - conc-list: [1845] - recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml" + 
recipe: "dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3124,7 +3124,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true - conc-list: [4916] - recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 16 @@ -3141,7 +3141,7 @@ dsr1-fp8-h100-dynamo-trt: # MTP configurations (6 points) - spec-decoding: "mtp" conc-list: [6] - recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -3154,7 +3154,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [9] - recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -3167,7 +3167,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [30] - recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -3180,7 +3180,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [77] - recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -3195,7 +3195,7 @@ dsr1-fp8-h100-dynamo-trt: # https://github.com/InferenceMAX/InferenceMAX/actions/runs/21769314582/job/62813105509 # - spec-decoding: "mtp" # conc-list: [78] - # recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml" + # recipe: "dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml" # prefill: # num-worker: 1 # tp: 16 @@ -3208,7 +3208,7 @@ dsr1-fp8-h100-dynamo-trt: # dp-attn: false - spec-decoding: "mtp" conc-list: [154] - recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml" prefill: num-worker: 2 tp: 16 @@ -3221,7 +3221,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: true # STP configurations (5 points) - conc-list: [6] - recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3233,7 +3233,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: false - conc-list: [9] - recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3245,7 +3245,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: false - conc-list: [30] - recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3257,7 +3257,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: false - conc-list: [154] - recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml" 
prefill: num-worker: 1 tp: 16 @@ -3269,7 +3269,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: false - conc-list: [308] - recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 16 @@ -3494,7 +3494,7 @@ dsr1-fp8-h100-dynamo-sglang: search-space: # # STP: Max throughput TEP (1 prefill, 2 decode) # - conc-list: [1, 2, 4, 8, 16, 32, 64, 128] - # recipe: "dsr1/sglang/h100-fp8/1k/1k/disagg/stp/h100-fp8-1p2d-max-tp.yaml" + # recipe: "dsr1/sglang/h100-fp8/1k1k/disagg/stp/h100-fp8-1p2d-max-tp.yaml" # prefill: # num-worker: 1 # tp: 16 @@ -3507,7 +3507,7 @@ dsr1-fp8-h100-dynamo-sglang: # dp-attn: false # # STP: Max throughput DEP (1 prefill, 1 decode, dp-attention) # - conc-list: [1, 2, 4, 8, 16, 32, 64] - # recipe: "dsr1/sglang/h100-fp8/1k/1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml" + # recipe: "dsr1/sglang/h100-fp8/1k1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml" # prefill: # num-worker: 1 # tp: 16 @@ -3521,7 +3521,7 @@ dsr1-fp8-h100-dynamo-sglang: # MTP: Max throughput TEP (1 prefill, 2 decode) - spec-decoding: "mtp" conc-list: [1, 2, 4, 8, 16, 32, 64, 128] - recipe: "dsr1/sglang/h100-fp8/1k/1k/disagg/mtp/h100-fp8-1p2d-max-tp-mtp.yaml" + recipe: "dsr1/sglang/h100-fp8/1k1k/disagg/mtp/h100-fp8-1p2d-max-tp-mtp.yaml" prefill: num-worker: 1 tp: 16 @@ -3535,7 +3535,7 @@ dsr1-fp8-h100-dynamo-sglang: # MTP: Max throughput DEP (1 prefill, 1 decode, dp-attention) - spec-decoding: "mtp" conc-list: [1, 2, 4, 8, 16, 32, 64] - recipe: "dsr1/sglang/h100-fp8/1k/1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml" + recipe: "dsr1/sglang/h100-fp8/1k1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml" prefill: num-worker: 1 tp: 16 @@ -3551,7 +3551,7 @@ dsr1-fp8-h100-dynamo-sglang: search-space: # # STP: Max throughput TEP (1 prefill, 1 decode) # - conc-list: [1, 2, 4, 8, 16, 32, 64, 128] - # recipe: "dsr1/sglang/h100-fp8/8k/1k/disagg/stp/h100-fp8-1p1d-max-tp.yaml" + # recipe: "dsr1/sglang/h100-fp8/8k1k/disagg/stp/h100-fp8-1p1d-max-tp.yaml" # prefill: # num-worker: 1 # tp: 16 @@ -3564,7 +3564,7 @@ dsr1-fp8-h100-dynamo-sglang: # dp-attn: false # # STP: Max throughput DEP (1 prefill, 1 decode, dp-attention) # - conc-list: [1, 2, 4, 8, 16, 32, 64] - # recipe: "dsr1/sglang/h100-fp8/8k/1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml" + # recipe: "dsr1/sglang/h100-fp8/8k1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml" # prefill: # num-worker: 1 # tp: 16 @@ -3578,7 +3578,7 @@ dsr1-fp8-h100-dynamo-sglang: # MTP: Max throughput TEP (1 prefill, 1 decode) - spec-decoding: "mtp" conc-list: [1, 2, 4, 8, 16, 32, 64, 128] - recipe: "dsr1/sglang/h100-fp8/8k/1k/disagg/mtp/h100-fp8-1p1d-max-tp-mtp.yaml" + recipe: "dsr1/sglang/h100-fp8/8k1k/disagg/mtp/h100-fp8-1p1d-max-tp-mtp.yaml" prefill: num-worker: 1 tp: 16 @@ -3592,7 +3592,7 @@ dsr1-fp8-h100-dynamo-sglang: # MTP: Max throughput DEP (1 prefill, 1 decode, dp-attention) - spec-decoding: "mtp" conc-list: [1, 2, 4, 8, 16, 32, 64] - recipe: "dsr1/sglang/h100-fp8/8k/1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml" + recipe: "dsr1/sglang/h100-fp8/8k1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml" prefill: num-worker: 1 tp: 16 @@ -3687,7 +3687,7 @@ dsr1-fp4-gb200-dynamo-trt: # MTP configurations (spec_decoding="mtp") - spec-decoding: "mtp" conc-list: [ 180 ] - recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 4 @@ -3700,7 +3700,7 @@ 
dsr1-fp4-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [ 4, 8, 12, 24, 48 ] - recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 4 @@ -3713,7 +3713,7 @@ dsr1-fp4-gb200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [ 4301 ] - recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml" prefill: num-worker: 2 tp: 4 @@ -3726,7 +3726,7 @@ dsr1-fp4-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [ 2253 ] - recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml" prefill: num-worker: 3 tp: 4 @@ -3739,7 +3739,7 @@ dsr1-fp4-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [ 16130 ] - recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml" prefill: num-worker: 3 tp: 4 @@ -3754,7 +3754,7 @@ dsr1-fp4-gb200-dynamo-trt: # Non-MTP configurations (default spec_decoding="none") - conc-list: [ 4301 ] - recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -3766,7 +3766,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [ 666 ] - recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -3778,7 +3778,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 32 dp-attn: true - conc-list: [ 6144 ] - recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -3790,7 +3790,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 4 dp-attn: true - conc-list: [ 12, 24, 48, 96, 192 ] - recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -3802,7 +3802,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [ 5 ] - recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -3814,7 +3814,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [ 4301 ] - recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml" prefill: num-worker: 2 tp: 4 @@ -3826,7 +3826,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 16 dp-attn: true - conc-list: [ 2253 ] - recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 4 @@ -3844,7 +3844,7 @@ dsr1-fp4-gb200-dynamo-trt: # MTP configurations (spec_decoding="mtp") - spec-decoding: "mtp" conc-list: [ 4, 8, 
12, 24, 48 ] - recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 4 @@ -3857,7 +3857,7 @@ dsr1-fp4-gb200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [ 180 ] - recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml" prefill: num-worker: 3 tp: 4 @@ -3870,7 +3870,7 @@ dsr1-fp4-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [ 1229 ] - recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml" prefill: num-worker: 7 tp: 4 @@ -3883,7 +3883,7 @@ dsr1-fp4-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [ 666 ] - recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml" prefill: num-worker: 8 tp: 4 @@ -3896,7 +3896,7 @@ dsr1-fp4-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [ 4301 ] - recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml" prefill: num-worker: 11 tp: 4 @@ -3910,7 +3910,7 @@ dsr1-fp4-gb200-dynamo-trt: # Non-MTP configurations (default spec_decoding="none") - conc-list: [ 12, 44, 76 ] - recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -3922,7 +3922,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [ 5 ] - recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -3934,7 +3934,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [ 333 ] - recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 4 @@ -3946,7 +3946,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 32 dp-attn: true - conc-list: [ 1229 ] - recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml" prefill: num-worker: 7 tp: 4 @@ -3958,7 +3958,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 32 dp-attn: true - conc-list: [ 2253 ] - recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml" prefill: num-worker: 8 tp: 4 @@ -3970,7 +3970,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 16 dp-attn: true - conc-list: [ 4096 ] - recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml" prefill: num-worker: 10 tp: 4 @@ -3999,7 +3999,7 @@ dsr1-fp8-gb200-dynamo-trt: search-space: - spec-decoding: "mtp" conc-list: [4301] - recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml" + recipe: 
"dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml" prefill: num-worker: 1 tp: 8 @@ -4012,7 +4012,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [2151] - recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml" prefill: num-worker: 1 tp: 8 @@ -4025,7 +4025,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1229] - recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml" prefill: num-worker: 1 tp: 8 @@ -4038,7 +4038,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [615] - recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml" prefill: num-worker: 1 tp: 8 @@ -4051,7 +4051,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [36] - recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml" prefill: num-worker: 1 tp: 8 @@ -4064,7 +4064,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [18] - recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml" prefill: num-worker: 1 tp: 8 @@ -4077,7 +4077,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [9] - recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml" prefill: num-worker: 1 tp: 8 @@ -4090,7 +4090,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: false # 1k1k STP configs - conc-list: [6144] - recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml" prefill: num-worker: 1 tp: 8 @@ -4102,7 +4102,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [4301] - recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml" prefill: num-worker: 1 tp: 8 @@ -4114,7 +4114,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [2151] - recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml" prefill: num-worker: 1 tp: 8 @@ -4126,7 +4126,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 16 dp-attn: true - conc-list: [1127] - recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml" prefill: num-worker: 1 tp: 8 @@ -4138,7 +4138,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 32 dp-attn: true - conc-list: [256] - recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml" 
prefill: num-worker: 1 tp: 8 @@ -4150,7 +4150,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 32 dp-attn: true - conc-list: [27] - recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml" prefill: num-worker: 1 tp: 8 @@ -4162,7 +4162,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [3] - recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml" prefill: num-worker: 1 tp: 8 @@ -4179,7 +4179,7 @@ dsr1-fp8-gb200-dynamo-trt: search-space: - spec-decoding: "mtp" conc-list: [666] - recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml" + recipe: "dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml" prefill: num-worker: 3 tp: 8 @@ -4192,7 +4192,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [666] - recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml" + recipe: "dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml" prefill: num-worker: 5 tp: 8 @@ -4205,7 +4205,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [333] - recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml" + recipe: "dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml" prefill: num-worker: 3 tp: 8 @@ -4218,7 +4218,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [333] - recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml" + recipe: "dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml" prefill: num-worker: 4 tp: 8 @@ -4231,7 +4231,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [90] - recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml" + recipe: "dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml" prefill: num-worker: 2 tp: 8 @@ -4244,7 +4244,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [15] - recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml" + recipe: "dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml" prefill: num-worker: 1 tp: 8 @@ -4257,7 +4257,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [6] - recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml" + recipe: "dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml" prefill: num-worker: 1 tp: 8 @@ -4270,7 +4270,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: false # 8k1k STP configs - conc-list: [1229] - recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml" + recipe: "dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml" prefill: num-worker: 5 tp: 8 @@ -4282,7 +4282,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 16 dp-attn: true - conc-list: [666] - recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml" + recipe: "dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml" prefill: num-worker: 4 tp: 8 @@ -4294,7 +4294,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 32 dp-attn: true - 
conc-list: [615] - recipe: "dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml" prefill: num-worker: 3 tp: 8 @@ -4306,7 +4306,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 16 dp-attn: true - conc-list: [333] - recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml" + recipe: "dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml" prefill: num-worker: 2 tp: 8 @@ -4318,7 +4318,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 32 dp-attn: true - conc-list: [63] - recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml" + recipe: "dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml" prefill: num-worker: 1 tp: 8 @@ -4330,7 +4330,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [18] - recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml" + recipe: "dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml" prefill: num-worker: 1 tp: 8 @@ -4342,7 +4342,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [6] - recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml" + recipe: "dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml" prefill: num-worker: 1 tp: 8 @@ -4370,7 +4370,7 @@ dsr1-fp8-gb200-dynamo-sglang: search-space: # "Low latency" (1 prefill worker at TP4 and 1 decode worker at TP4) - conc-list: [4, 8] - recipe: "dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/low-latency.yaml" + recipe: "dsr1/sglang/gb200-fp8/1k1k/disagg/stp/low-latency.yaml" prefill: num-worker: 1 tp: 4 @@ -4384,7 +4384,7 @@ dsr1-fp8-gb200-dynamo-sglang: # "Mid curve" (3 prefill workers at DEP8 and 1 decode worker at DEP48) - conc-list: [1024, 2048, 4096] - recipe: "dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/mid-curve.yaml" + recipe: "dsr1/sglang/gb200-fp8/1k1k/disagg/stp/mid-curve.yaml" prefill: num-worker: 3 tp: 8 @@ -4398,7 +4398,7 @@ dsr1-fp8-gb200-dynamo-sglang: # "Max throughput" (2 prefill workers at DEP8 and 1 decode worker at DEP32) - conc-list: [1024, 2048, 4096, 6144] - recipe: "dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/max-tpt.yaml" + recipe: "dsr1/sglang/gb200-fp8/1k1k/disagg/stp/max-tpt.yaml" prefill: num-worker: 2 tp: 8 @@ -4412,7 +4412,7 @@ dsr1-fp8-gb200-dynamo-sglang: # "Ultra throughput" (1 prefill worker at DEP8 and 1 decode worker at DEP8) - conc-list: [4096] - recipe: "dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/ultra-tpt.yaml" + recipe: "dsr1/sglang/gb200-fp8/1k1k/disagg/stp/ultra-tpt.yaml" prefill: num-worker: 1 tp: 8 @@ -4429,7 +4429,7 @@ dsr1-fp8-gb200-dynamo-sglang: search-space: # "Low latency" (1 prefill worker at TP8 and 1 decode worker at TP8) - conc-list: [4, 8, 16] - recipe: "dsr1/sglang/gb200-fp8/8k/1k/disagg/stp/low-latency.yaml" + recipe: "dsr1/sglang/gb200-fp8/8k1k/disagg/stp/low-latency.yaml" prefill: num-worker: 1 tp: 8 @@ -4443,7 +4443,7 @@ dsr1-fp8-gb200-dynamo-sglang: # "Mid curve" (5 prefill workers at DEP8 and 1 decode worker at DEP32) - conc-list: [512, 1024, 2048, 6144] - recipe: "dsr1/sglang/gb200-fp8/8k/1k/disagg/stp/mid-curve.yaml" + recipe: "dsr1/sglang/gb200-fp8/8k1k/disagg/stp/mid-curve.yaml" prefill: num-worker: 5 tp: 8 @@ -4457,7 +4457,7 @@ dsr1-fp8-gb200-dynamo-sglang: # "Max throughput" (6 prefill workers at DEP8 and 1 decode worker at DEP24) - conc-list: [2048, 4096, 6144] - recipe:
"dsr1/sglang/gb200-fp8/8k/1k/disagg/stp/max_tpt.yaml" + recipe: "dsr1/sglang/gb200-fp8/8k1k/disagg/stp/max_tpt.yaml" prefill: num-worker: 6 tp: 8 @@ -4484,7 +4484,7 @@ dsr1-fp8-gb300-dynamo-sglang: search-space: # "Low latency" (1 prefill worker at TP4 and 4 decode workers at TP4) - conc-list: [4, 8, 16, 32] - recipe: "dsr1/sglang/gb300-fp8/1k/1k/disagg/stp/low-latency.yaml" + recipe: "dsr1/sglang/gb300-fp8/1k1k/disagg/stp/low-latency.yaml" prefill: num-worker: 1 tp: 4 @@ -4498,7 +4498,7 @@ dsr1-fp8-gb300-dynamo-sglang: # "Mid curve" (2 prefill workers at DEP8 and 1 decode worker at DEP32) - conc-list: [1024, 2048, 4096, 6144] - recipe: "dsr1/sglang/gb300-fp8/1k/1k/disagg/stp/mid.yaml" + recipe: "dsr1/sglang/gb300-fp8/1k1k/disagg/stp/mid.yaml" prefill: num-worker: 2 tp: 8 @@ -4512,7 +4512,7 @@ dsr1-fp8-gb300-dynamo-sglang: # "Max throughput" (1 prefill worker at DEP8 and 1 decode worker at DEP8) - conc-list: [4096, 7168, 7680] - recipe: "dsr1/sglang/gb300-fp8/1k/1k/disagg/stp/max.yaml" + recipe: "dsr1/sglang/gb300-fp8/1k1k/disagg/stp/max.yaml" prefill: num-worker: 1 tp: 8 @@ -4529,7 +4529,7 @@ dsr1-fp8-gb300-dynamo-sglang: search-space: # "Low latency" (1 prefill worker at TP4 and 1 decode worker at TP4) - conc-list: [4, 8] - recipe: "dsr1/sglang/gb300-fp8/8k/1k/disagg/stp/low-latency.yaml" + recipe: "dsr1/sglang/gb300-fp8/8k1k/disagg/stp/low-latency.yaml" prefill: num-worker: 1 tp: 4 @@ -4543,7 +4543,7 @@ dsr1-fp8-gb300-dynamo-sglang: # "Mid curve" (5 prefill workers at DEP8 and 1 decode worker at DEP32) - conc-list: [128, 256, 512, 1024] - recipe: "dsr1/sglang/gb300-fp8/8k/1k/disagg/stp/mid.yaml" + recipe: "dsr1/sglang/gb300-fp8/8k1k/disagg/stp/mid.yaml" prefill: num-worker: 5 tp: 8 @@ -4557,7 +4557,7 @@ dsr1-fp8-gb300-dynamo-sglang: # "Max throughput" (6 prefill workers at DEP8 and 1 decode worker at DEP24) - conc-list: [2048, 4096] - recipe: "dsr1/sglang/gb300-fp8/8k/1k/disagg/stp/max.yaml" + recipe: "dsr1/sglang/gb300-fp8/8k1k/disagg/stp/max.yaml" prefill: num-worker: 6 tp: 8 @@ -4586,7 +4586,7 @@ dsr1-fp4-gb200-dynamo-sglang: # Low latency (1 prefill node, 2 decode nodes) - spec-decoding: "none" conc-list: [ 4, 8, 32 ] - recipe: "dsr1/sglang/gb200-fp4/1k/1k/disagg/stp/low-latency.yaml" + recipe: "dsr1/sglang/gb200-fp4/1k1k/disagg/stp/low-latency.yaml" prefill: num-worker: 1 tp: 4 @@ -4601,7 +4601,7 @@ dsr1-fp4-gb200-dynamo-sglang: # Mid curve (4 prefill nodes, 8 decode nodes) - spec-decoding: "none" conc-list: [ 512, 2048, 4096, 8192 ] - recipe: "dsr1/sglang/gb200-fp4/1k/1k/disagg/stp/mid-curve.yaml" + recipe: "dsr1/sglang/gb200-fp4/1k1k/disagg/stp/mid-curve.yaml" prefill: num-worker: 4 tp: 4 @@ -4616,7 +4616,7 @@ dsr1-fp4-gb200-dynamo-sglang: # Max throughput (4 prefill nodes, 12 decode nodes) - spec-decoding: "none" conc-list: [ 2048, 4096 ] - recipe: "dsr1/sglang/gb200-fp4/1k/1k/disagg/stp/max-tpt.yaml" + recipe: "dsr1/sglang/gb200-fp4/1k1k/disagg/stp/max-tpt.yaml" prefill: num-worker: 4 tp: 4 @@ -4635,7 +4635,7 @@ dsr1-fp4-gb200-dynamo-sglang: # Low latency (1 prefill node, 4 decode nodes) - spec-decoding: "none" conc-list: [ 4, 8 ] - recipe: "dsr1/sglang/gb200-fp4/8k/1k/disagg/stp/low-latency.yaml" + recipe: "dsr1/sglang/gb200-fp4/8k1k/disagg/stp/low-latency.yaml" prefill: num-worker: 1 tp: 4 @@ -4650,7 +4650,7 @@ dsr1-fp4-gb200-dynamo-sglang: # Mid curve (6 prefill nodes, 12 decode nodes) - spec-decoding: "none" conc-list: [ 512, 2048, 4096 ] - recipe: "dsr1/sglang/gb200-fp4/8k/1k/disagg/stp/mid-curve.yaml" + recipe: "dsr1/sglang/gb200-fp4/8k1k/disagg/stp/mid-curve.yaml" 
prefill: num-worker: 6 tp: 4 @@ -4665,7 +4665,7 @@ dsr1-fp4-gb200-dynamo-sglang: # Max throughput (10 prefill nodes, 8 decode nodes) - spec-decoding: "none" conc-list: [ 2048 ] - recipe: "dsr1/sglang/gb200-fp4/8k/1k/disagg/stp/max-tpt.yaml" + recipe: "dsr1/sglang/gb200-fp4/8k1k/disagg/stp/max-tpt.yaml" prefill: num-worker: 10 tp: 4 @@ -4693,7 +4693,7 @@ dsr1-fp4-gb300-dynamo-trt: # MTP configurations - spec-decoding: "mtp" conc-list: [3226] - recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml" prefill: num-worker: 1 tp: 2 @@ -4706,7 +4706,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [333] - recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml" prefill: num-worker: 1 tp: 2 @@ -4719,7 +4719,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [5] - recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -4732,7 +4732,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [8, 12, 24, 48] - recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -4745,7 +4745,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [2253] - recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml" prefill: num-worker: 3 tp: 2 @@ -4758,7 +4758,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1229] - recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml" prefill: num-worker: 3 tp: 2 @@ -4771,7 +4771,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true # Non-MTP configurations (default spec_decoding="none") - conc-list: [5] - recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -4783,7 +4783,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 8 dp-attn: false - conc-list: [12, 48, 96, 192] - recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -4795,7 +4795,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 8 dp-attn: false - conc-list: [8192] - recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 2 @@ -4807,7 +4807,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 8 dp-attn: true - conc-list: [1229] - recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 2 @@ -4819,7 +4819,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 32 dp-attn: true - 
conc-list: [4301] - recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml" prefill: num-worker: 3 tp: 2 @@ -4831,7 +4831,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 16 dp-attn: true - conc-list: [2253] - recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml" prefill: num-worker: 3 tp: 2 @@ -4848,7 +4848,7 @@ dsr1-fp4-gb300-dynamo-trt: # MTP configurations (spec_decoding="mtp") - spec-decoding: "mtp" conc-list: [33] - recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -4861,7 +4861,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [5] - recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -4874,7 +4874,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [12, 24] - recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -4887,7 +4887,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [180] - recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml" prefill: num-worker: 4 tp: 2 @@ -4900,7 +4900,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [308] - recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml" prefill: num-worker: 8 tp: 2 @@ -4913,7 +4913,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [2253] - recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml" prefill: num-worker: 10 tp: 2 @@ -4926,7 +4926,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [666] - recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml" prefill: num-worker: 10 tp: 2 @@ -4939,7 +4939,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1127] - recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml" prefill: num-worker: 13 tp: 2 @@ -4952,7 +4952,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true # Non-MTP configurations (default spec_decoding="none") - conc-list: [72] - recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -4964,7 +4964,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 8 dp-attn: false - conc-list: [5] - recipe: 
"dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -4976,7 +4976,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 8 dp-attn: false - conc-list: [12] - recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -4988,7 +4988,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 8 dp-attn: false - conc-list: [5, 15, 30] - recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -5000,7 +5000,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 4 dp-attn: false - conc-list: [666] - recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml" prefill: num-worker: 7 tp: 2 @@ -5012,7 +5012,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 32 dp-attn: true - conc-list: [1229] - recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml" prefill: num-worker: 9 tp: 2 @@ -5024,7 +5024,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 16 dp-attn: true - conc-list: [3228] - recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml" prefill: num-worker: 11 tp: 2 @@ -5036,7 +5036,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 4 dp-attn: true - conc-list: [2253] - recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml" prefill: num-worker: 14 tp: 2 @@ -5065,7 +5065,7 @@ dsr1-fp4-gb300-dynamo-sglang: # Low latency (1 prefill node, 2 decode nodes) - spec-decoding: "none" conc-list: [ 4, 8, 32 ] - recipe: "dsr1/sglang/gb300-fp4/1k/1k/disagg/stp/low_latency.yaml" + recipe: "dsr1/sglang/gb300-fp4/1k1k/disagg/stp/low_latency.yaml" prefill: num-worker: 1 tp: 4 @@ -5080,7 +5080,7 @@ dsr1-fp4-gb300-dynamo-sglang: # Mid curve (4 prefill nodes, 8 decode nodes) - spec-decoding: "none" conc-list: [ 512, 2048, 4096, 8192 ] - recipe: "dsr1/sglang/gb300-fp4/1k/1k/disagg/stp/mid_curve.yaml" + recipe: "dsr1/sglang/gb300-fp4/1k1k/disagg/stp/mid_curve.yaml" prefill: num-worker: 4 tp: 4 @@ -5095,7 +5095,7 @@ dsr1-fp4-gb300-dynamo-sglang: # Max throughput (4 prefill nodes, 12 decode nodes) - spec-decoding: "none" conc-list: [ 512, 2048, 4096, 8192 ] - recipe: "dsr1/sglang/gb300-fp4/1k/1k/disagg/stp/max_tpt.yaml" + recipe: "dsr1/sglang/gb300-fp4/1k1k/disagg/stp/max_tpt.yaml" prefill: num-worker: 4 tp: 4 @@ -5114,7 +5114,7 @@ dsr1-fp4-gb300-dynamo-sglang: # Low latency (1 prefill node, 4 decode nodes) - spec-decoding: "none" conc-list: [ 4, 8, 32, 64 ] - recipe: "dsr1/sglang/gb300-fp4/8k/1k/disagg/stp/low_latency.yaml" + recipe: "dsr1/sglang/gb300-fp4/8k1k/disagg/stp/low_latency.yaml" prefill: num-worker: 1 tp: 4 @@ -5129,7 +5129,7 @@ dsr1-fp4-gb300-dynamo-sglang: # Mid curve (6 prefill nodes, 12 decode nodes) - spec-decoding: "none" conc-list: [ 512, 2048, 4096 ] - recipe: "dsr1/sglang/gb300-fp4/8k/1k/disagg/stp/mid_curve.yaml" + recipe: "dsr1/sglang/gb300-fp4/8k1k/disagg/stp/mid_curve.yaml" 
prefill: num-worker: 6 tp: 4 @@ -5144,7 +5144,7 @@ dsr1-fp4-gb300-dynamo-sglang: # Max throughput (10 prefill nodes, 8 decode nodes) - spec-decoding: "none" conc-list: [ 2048 ] - recipe: "dsr1/sglang/gb300-fp4/8k/1k/disagg/stp/max_tpt.yaml" + recipe: "dsr1/sglang/gb300-fp4/8k1k/disagg/stp/max_tpt.yaml" prefill: num-worker: 10 tp: 4 @@ -5172,7 +5172,7 @@ dsr1-fp8-gb300-dynamo-trt: # MTP configurations (spec_decoding="mtp") - spec-decoding: "mtp" conc-list: [8] - recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml" + recipe: "dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml" prefill: num-worker: 1 tp: 4 @@ -5185,7 +5185,7 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [24] - recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml" + recipe: "dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml" prefill: num-worker: 1 tp: 4 @@ -5198,7 +5198,7 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [180] - recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml" + recipe: "dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml" prefill: num-worker: 1 tp: 4 @@ -5211,7 +5211,7 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [564] - recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml" + recipe: "dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml" prefill: num-worker: 2 tp: 4 @@ -5224,7 +5224,7 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [666] - recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml" + recipe: "dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml" prefill: num-worker: 1 tp: 4 @@ -5237,7 +5237,7 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [2253] - recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml" + recipe: "dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml" prefill: num-worker: 2 tp: 4 @@ -5250,7 +5250,7 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [8192] - recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml" + recipe: "dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml" prefill: num-worker: 3 tp: 4 @@ -5263,7 +5263,7 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: true # STP configurations (no spec_decoding) - conc-list: [4] - recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml" + recipe: "dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml" prefill: num-worker: 1 tp: 4 @@ -5275,7 +5275,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 8 dp-attn: false - conc-list: [24] - recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml" + recipe: "dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml" prefill: num-worker: 1 tp: 4 @@ -5287,7 +5287,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 8 dp-attn: false - conc-list: [84] - recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml" + recipe: "dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml" prefill: num-worker: 1 tp: 4 @@ -5299,7 +5299,7 @@ 
dsr1-fp8-gb300-dynamo-trt: ep: 8 dp-attn: false - conc-list: [1229] - recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml" + recipe: "dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml" prefill: num-worker: 2 tp: 4 @@ -5311,7 +5311,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 32 dp-attn: true - conc-list: [2253] - recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml" + recipe: "dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml" prefill: num-worker: 2 tp: 4 @@ -5323,7 +5323,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 16 dp-attn: true - conc-list: [8602] - recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml" + recipe: "dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml" prefill: num-worker: 3 tp: 4 @@ -5335,7 +5335,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 8 dp-attn: true - conc-list: [12288] - recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml" + recipe: "dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml" prefill: num-worker: 3 tp: 4 @@ -5352,7 +5352,7 @@ dsr1-fp8-gb300-dynamo-trt: # MTP configurations (spec_decoding="mtp") - spec-decoding: "mtp" conc-list: [8] - recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml" + recipe: "dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml" prefill: num-worker: 1 tp: 4 @@ -5365,7 +5365,7 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [24] - recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml" + recipe: "dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml" prefill: num-worker: 1 tp: 4 @@ -5378,7 +5378,7 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [333] - recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml" + recipe: "dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml" prefill: num-worker: 6 tp: 4 @@ -5391,7 +5391,7 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [666] - recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml" + recipe: "dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml" prefill: num-worker: 8 tp: 4 @@ -5404,7 +5404,7 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1229] - recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml" + recipe: "dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml" prefill: num-worker: 10 tp: 4 @@ -5417,7 +5417,7 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1229] - recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml" + recipe: "dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml" prefill: num-worker: 7 tp: 4 @@ -5430,7 +5430,7 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: true # STP configurations (no spec_decoding) - conc-list: [4] - recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml" + recipe: "dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml" prefill: num-worker: 1 tp: 4 @@ -5442,7 +5442,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 8 
dp-attn: false - conc-list: [24] - recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml" + recipe: "dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml" prefill: num-worker: 1 tp: 4 @@ -5454,7 +5454,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 8 dp-attn: false - conc-list: [36] - recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml" + recipe: "dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml" prefill: num-worker: 1 tp: 4 @@ -5466,7 +5466,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 8 dp-attn: false - conc-list: [512] - recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml" + recipe: "dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml" prefill: num-worker: 6 tp: 4 @@ -5478,7 +5478,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 32 dp-attn: true - conc-list: [666] - recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml" + recipe: "dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml" prefill: num-worker: 4 tp: 4 @@ -5490,7 +5490,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 16 dp-attn: true - conc-list: [1229] - recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml" + recipe: "dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml" prefill: num-worker: 7 tp: 4 @@ -5502,7 +5502,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 16 dp-attn: true - conc-list: [2151] - recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml" + recipe: "dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml" prefill: num-worker: 7 tp: 4 @@ -5800,7 +5800,7 @@ dsr1-fp8-h200-dynamo-sglang: # STP: Low latency (1 prefill, 9 decode, TEP) - spec-decoding: "none" conc-list: [1, 4, 8, 16, 32, 64, 128, 256] - recipe: "dsr1/sglang/h200-fp8/1k/1k/disagg/stp/low-latency-1p9d.yaml" + recipe: "dsr1/sglang/h200-fp8/1k1k/disagg/stp/low-latency-1p9d.yaml" prefill: num-worker: 1 tp: 8 @@ -5814,7 +5814,7 @@ dsr1-fp8-h200-dynamo-sglang: # STP: High throughput TEP (1 prefill, 6 decode) - spec-decoding: "none" conc-list: [512, 1024, 2048] - recipe: "dsr1/sglang/h200-fp8/1k/1k/disagg/stp/bs256-1p6d-tp.yaml" + recipe: "dsr1/sglang/h200-fp8/1k1k/disagg/stp/bs256-1p6d-tp.yaml" prefill: num-worker: 1 tp: 8 @@ -5828,7 +5828,7 @@ dsr1-fp8-h200-dynamo-sglang: # STP: High throughput DEP (1 prefill, 6 decode, dp-attention) - spec-decoding: "none" conc-list: [128, 256, 512, 1024, 2048] - recipe: "dsr1/sglang/h200-fp8/1k/1k/disagg/stp/bs256-1p6d-dep.yaml" + recipe: "dsr1/sglang/h200-fp8/1k1k/disagg/stp/bs256-1p6d-dep.yaml" prefill: num-worker: 1 tp: 8 @@ -5842,7 +5842,7 @@ dsr1-fp8-h200-dynamo-sglang: # MTP: Low latency (1 prefill, 9 decode, TEP) - spec-decoding: "mtp" conc-list: [1, 4, 8, 16, 32, 64, 128, 256] - recipe: "dsr1/sglang/h200-fp8/1k/1k/disagg/mtp/low-latency-1p9d-mtp.yaml" + recipe: "dsr1/sglang/h200-fp8/1k1k/disagg/mtp/low-latency-1p9d-mtp.yaml" prefill: num-worker: 1 tp: 8 @@ -5856,7 +5856,7 @@ dsr1-fp8-h200-dynamo-sglang: # MTP: High throughput TEP (1 prefill, 6 decode) - spec-decoding: "mtp" conc-list: [512, 1024, 2048] - recipe: "dsr1/sglang/h200-fp8/1k/1k/disagg/mtp/bs256-1p6d-tp-mtp.yaml" + recipe: "dsr1/sglang/h200-fp8/1k1k/disagg/mtp/bs256-1p6d-tp-mtp.yaml" prefill: num-worker: 1 tp: 8 @@ -5870,7 +5870,7 @@ dsr1-fp8-h200-dynamo-sglang: # MTP: High throughput DEP (1 prefill, 6 decode, 
dp-attention) - spec-decoding: "mtp" conc-list: [128, 256, 512, 1024, 2048] - recipe: "dsr1/sglang/h200-fp8/1k/1k/disagg/mtp/bs256-1p6d-dep-mtp.yaml" + recipe: "dsr1/sglang/h200-fp8/1k1k/disagg/mtp/bs256-1p6d-dep-mtp.yaml" prefill: num-worker: 1 tp: 8 @@ -5887,7 +5887,7 @@ dsr1-fp8-h200-dynamo-sglang: # STP: Low latency TEP (1 prefill, 7 decode) - spec-decoding: "none" conc-list: [1, 4, 8] - recipe: "dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs4-1p7d.yaml" + recipe: "dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs4-1p7d.yaml" prefill: num-worker: 1 tp: 8 @@ -5901,7 +5901,7 @@ dsr1-fp8-h200-dynamo-sglang: # STP: TEP (1 prefill, 6 decode) - spec-decoding: "none" conc-list: [4, 8, 16] - recipe: "dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs8-1p6d.yaml" + recipe: "dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs8-1p6d.yaml" prefill: num-worker: 1 tp: 8 @@ -5915,7 +5915,7 @@ dsr1-fp8-h200-dynamo-sglang: # STP: TEP (1 prefill, 3 decode) - spec-decoding: "none" conc-list: [8, 16, 32] - recipe: "dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs16-1p3d.yaml" + recipe: "dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs16-1p3d.yaml" prefill: num-worker: 1 tp: 8 @@ -5929,7 +5929,7 @@ dsr1-fp8-h200-dynamo-sglang: # STP: TEP (2 prefill, 3 decode) - spec-decoding: "none" conc-list: [32, 64, 128] - recipe: "dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs64-2p3d.yaml" + recipe: "dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs64-2p3d.yaml" prefill: num-worker: 2 tp: 8 @@ -5943,7 +5943,7 @@ dsr1-fp8-h200-dynamo-sglang: # STP: High throughput DEP (1 prefill, 1 decode, dp-attention) - spec-decoding: "none" conc-list: [64, 128, 256] - recipe: "dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs128-1p1d-dep.yaml" + recipe: "dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs128-1p1d-dep.yaml" prefill: num-worker: 1 tp: 8 @@ -5957,7 +5957,7 @@ dsr1-fp8-h200-dynamo-sglang: # MTP: Low latency TEP (1 prefill, 7 decode) - spec-decoding: "mtp" conc-list: [1, 4, 8] - recipe: "dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs4-1p7d-mtp.yaml" + recipe: "dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs4-1p7d-mtp.yaml" prefill: num-worker: 1 tp: 8 @@ -5971,7 +5971,7 @@ dsr1-fp8-h200-dynamo-sglang: # MTP: TEP (1 prefill, 6 decode) - spec-decoding: "mtp" conc-list: [2, 4, 8, 16, 32] - recipe: "dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs8-1p6d-mtp.yaml" + recipe: "dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs8-1p6d-mtp.yaml" prefill: num-worker: 1 tp: 8 @@ -5985,7 +5985,7 @@ dsr1-fp8-h200-dynamo-sglang: # MTP: TEP (1 prefill, 3 decode) - spec-decoding: "mtp" conc-list: [4, 8, 16, 32, 64] - recipe: "dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs16-1p3d-mtp.yaml" + recipe: "dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs16-1p3d-mtp.yaml" prefill: num-worker: 1 tp: 8 @@ -5999,7 +5999,7 @@ dsr1-fp8-h200-dynamo-sglang: # MTP: TEP (2 prefill, 3 decode) - spec-decoding: "mtp" conc-list: [32, 64, 128] - recipe: "dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs64-2p3d-mtp.yaml" + recipe: "dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs64-2p3d-mtp.yaml" prefill: num-worker: 2 tp: 8 @@ -6013,7 +6013,7 @@ dsr1-fp8-h200-dynamo-sglang: # MTP: High throughput DEP (1 prefill, 1 decode, dp-attention) - spec-decoding: "mtp" conc-list: [32, 64, 128, 256, 512] - recipe: "dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs128-1p1d-dep-mtp.yaml" + recipe: "dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs128-1p1d-dep-mtp.yaml" prefill: num-worker: 1 tp: 8 @@ -6040,7 +6040,7 @@ dsr1-fp4-b200-dynamo-sglang: search-space: # Non-MTP configurations - conc-list: [16, 128] - recipe: "dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml:zip_override_stp_lowlat[0]" + recipe: 
"dsr1/sglang/b200-fp4/1k1k/disagg/1k1k.yaml:zip_override_stp_lowlat[0]" prefill: num-worker: 1 tp: 4 @@ -6052,7 +6052,7 @@ dsr1-fp4-b200-dynamo-sglang: ep: 8 dp-attn: false - conc-list: [32, 64, 256] - recipe: "dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml:zip_override_stp_lowlat[1]" + recipe: "dsr1/sglang/b200-fp4/1k1k/disagg/1k1k.yaml:zip_override_stp_lowlat[1]" prefill: num-worker: 1 tp: 4 @@ -6064,7 +6064,7 @@ dsr1-fp4-b200-dynamo-sglang: ep: 8 dp-attn: false - conc-list: [512] - recipe: "dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml:zip_override_stp_maxtpt[0]" + recipe: "dsr1/sglang/b200-fp4/1k1k/disagg/1k1k.yaml:zip_override_stp_maxtpt[0]" prefill: num-worker: 1 tp: 4 @@ -6076,7 +6076,7 @@ dsr1-fp4-b200-dynamo-sglang: ep: 8 dp-attn: true - conc-list: [512] - recipe: "dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml:zip_override_stp_maxtpt[1]" + recipe: "dsr1/sglang/b200-fp4/1k1k/disagg/1k1k.yaml:zip_override_stp_maxtpt[1]" prefill: num-worker: 1 tp: 4 @@ -6092,7 +6092,7 @@ dsr1-fp4-b200-dynamo-sglang: search-space: # Non-MTP configurations - conc-list: [64, 128] - recipe: "dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml:zip_override_stp_lowlat[0]" + recipe: "dsr1/sglang/b200-fp4/8k1k/disagg/8k1k.yaml:zip_override_stp_lowlat[0]" prefill: num-worker: 1 tp: 4 @@ -6104,7 +6104,7 @@ dsr1-fp4-b200-dynamo-sglang: ep: 8 dp-attn: false - conc-list: [8] - recipe: "dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml:zip_override_stp_lowlat[1]" + recipe: "dsr1/sglang/b200-fp4/8k1k/disagg/8k1k.yaml:zip_override_stp_lowlat[1]" prefill: num-worker: 1 tp: 4 @@ -6116,7 +6116,7 @@ dsr1-fp4-b200-dynamo-sglang: ep: 8 dp-attn: false - conc-list: [4, 128] - recipe: "dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml:zip_override_stp_lowlat[2]" + recipe: "dsr1/sglang/b200-fp4/8k1k/disagg/8k1k.yaml:zip_override_stp_lowlat[2]" prefill: num-worker: 2 tp: 4 @@ -6128,7 +6128,7 @@ dsr1-fp4-b200-dynamo-sglang: ep: 8 dp-attn: false - conc-list: [4, 8, 16, 64] - recipe: "dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml:override_stp_tp4" + recipe: "dsr1/sglang/b200-fp4/8k1k/disagg/8k1k.yaml:override_stp_tp4" prefill: num-worker: 1 tp: 4 @@ -6140,7 +6140,7 @@ dsr1-fp4-b200-dynamo-sglang: ep: 1 dp-attn: false - conc-list: [1024, 2048] - recipe: "dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml:override_stp_maxtpt_7p2d" + recipe: "dsr1/sglang/b200-fp4/8k1k/disagg/8k1k.yaml:override_stp_maxtpt_7p2d" prefill: num-worker: 7 tp: 4 @@ -6167,7 +6167,7 @@ dsr1-fp8-b200-dynamo-sglang: search-space: # Non-MTP configurations - conc-list: [4] - recipe: "dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml:zip_override_stp_lowlat[0]" + recipe: "dsr1/sglang/b200-fp8/1k1k/disagg/1k1k.yaml:zip_override_stp_lowlat[0]" prefill: num-worker: 1 tp: 8 @@ -6179,7 +6179,7 @@ dsr1-fp8-b200-dynamo-sglang: ep: 8 dp-attn: false - conc-list: [16, 32, 64, 128, 256] - recipe: "dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml:zip_override_stp_lowlat[1]" + recipe: "dsr1/sglang/b200-fp8/1k1k/disagg/1k1k.yaml:zip_override_stp_lowlat[1]" prefill: num-worker: 1 tp: 8 @@ -6191,7 +6191,7 @@ dsr1-fp8-b200-dynamo-sglang: ep: 8 dp-attn: false - conc-list: [1024, 2048, 4096] - recipe: "dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml:zip_override_stp_maxtpt[0]" + recipe: "dsr1/sglang/b200-fp8/1k1k/disagg/1k1k.yaml:zip_override_stp_maxtpt[0]" prefill: num-worker: 1 tp: 8 @@ -6203,7 +6203,7 @@ dsr1-fp8-b200-dynamo-sglang: ep: 8 dp-attn: true - conc-list: [2048, 4096] - recipe: "dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml:zip_override_stp_maxtpt[1]" + recipe: 
"dsr1/sglang/b200-fp8/1k1k/disagg/1k1k.yaml:zip_override_stp_maxtpt[1]" prefill: num-worker: 2 tp: 8 @@ -6219,7 +6219,7 @@ dsr1-fp8-b200-dynamo-sglang: search-space: # STP low-latency: resolved from 8k1k.yaml zip_override_stp_lowlat - conc-list: [128] - recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_lowlat_0.yaml" + recipe: "dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml" prefill: num-worker: 1 tp: 8 @@ -6231,7 +6231,7 @@ dsr1-fp8-b200-dynamo-sglang: ep: 1 dp-attn: false - conc-list: [128] - recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_lowlat_1.yaml" + recipe: "dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml" prefill: num-worker: 1 tp: 8 @@ -6243,7 +6243,7 @@ dsr1-fp8-b200-dynamo-sglang: ep: 1 dp-attn: false - conc-list: [8, 16, 32, 64, 128] - recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_lowlat_2.yaml" + recipe: "dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml" prefill: num-worker: 1 tp: 8 @@ -6256,7 +6256,7 @@ dsr1-fp8-b200-dynamo-sglang: dp-attn: false # STP max-throughput: resolved from 8k1k.yaml zip_override_stp_maxtpt - conc-list: [288] - recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_0.yaml" + recipe: "dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_0.yaml" prefill: num-worker: 1 tp: 8 @@ -6268,7 +6268,7 @@ dsr1-fp8-b200-dynamo-sglang: ep: 8 dp-attn: true - conc-list: [160, 288] - recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_1.yaml" + recipe: "dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_1.yaml" prefill: num-worker: 1 tp: 8 @@ -6280,7 +6280,7 @@ dsr1-fp8-b200-dynamo-sglang: ep: 8 dp-attn: true - conc-list: [512] - recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_2.yaml" + recipe: "dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_2.yaml" prefill: num-worker: 2 tp: 8 @@ -6292,7 +6292,7 @@ dsr1-fp8-b200-dynamo-sglang: ep: 8 dp-attn: true - conc-list: [1024] - recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_3.yaml" + recipe: "dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_3.yaml" prefill: num-worker: 3 tp: 8 @@ -6320,7 +6320,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: # MTP low-latency: 1P1D - spec-decoding: "mtp" conc-list: [4, 64] - recipe: "dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml:zip_override_mtp_lowlat[0]" + recipe: "dsr1/sglang/b200-fp8/1k1k/disagg/1k1k.yaml:zip_override_mtp_lowlat[0]" prefill: num-worker: 1 tp: 8 @@ -6334,7 +6334,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: # MTP low-latency: 1P3D - spec-decoding: "mtp" conc-list: [4, 8, 16, 32, 128] - recipe: "dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml:zip_override_mtp_lowlat[1]" + recipe: "dsr1/sglang/b200-fp8/1k1k/disagg/1k1k.yaml:zip_override_mtp_lowlat[1]" prefill: num-worker: 1 tp: 8 @@ -6348,7 +6348,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: # MTP max-tpt: 1P5D - spec-decoding: "mtp" conc-list: [512, 4096] - recipe: "dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml:zip_override_mtp_maxtpt[1]" + recipe: "dsr1/sglang/b200-fp8/1k1k/disagg/1k1k.yaml:zip_override_mtp_maxtpt[1]" prefill: num-worker: 1 tp: 8 @@ -6362,7 +6362,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: # MTP max-tpt: 2P5D - spec-decoding: "mtp" conc-list: [1024, 2048, 4096] - recipe: "dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml:zip_override_mtp_maxtpt[2]" + recipe: "dsr1/sglang/b200-fp8/1k1k/disagg/1k1k.yaml:zip_override_mtp_maxtpt[2]" prefill: num-worker: 2 tp: 8 @@ -6376,7 +6376,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: # MTP max-tpt: 1P2D - spec-decoding: "mtp" conc-list: [512, 1024, 2048] - recipe: 
"dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml:override_mtp_maxtpt_1p2d" + recipe: "dsr1/sglang/b200-fp8/1k1k/disagg/1k1k.yaml:override_mtp_maxtpt_1p2d" prefill: num-worker: 1 tp: 8 @@ -6393,7 +6393,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: # MTP low-latency: resolved from 8k1k.yaml zip_override_mtp_lowlat - spec-decoding: "mtp" conc-list: [128] - recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml" + recipe: "dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml" prefill: num-worker: 1 tp: 8 @@ -6406,7 +6406,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: dp-attn: false - spec-decoding: "mtp" conc-list: [128] - recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml" + recipe: "dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml" prefill: num-worker: 1 tp: 8 @@ -6419,7 +6419,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: dp-attn: false - spec-decoding: "mtp" conc-list: [8, 16, 32, 64, 128] - recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml" + recipe: "dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml" prefill: num-worker: 1 tp: 8 @@ -6433,7 +6433,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: # MTP max-throughput: resolved from 8k1k.yaml zip_override_mtp_maxtpt - spec-decoding: "mtp" conc-list: [288] - recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_0.yaml" + recipe: "dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_0.yaml" prefill: num-worker: 1 tp: 8 @@ -6446,7 +6446,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: dp-attn: true - spec-decoding: "mtp" conc-list: [160, 288] - recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_1.yaml" + recipe: "dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_1.yaml" prefill: num-worker: 1 tp: 8 @@ -6459,7 +6459,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: dp-attn: true - spec-decoding: "mtp" conc-list: [512] - recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_2.yaml" + recipe: "dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_2.yaml" prefill: num-worker: 2 tp: 8 @@ -6472,7 +6472,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: dp-attn: true - spec-decoding: "mtp" conc-list: [1024] - recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_3.yaml" + recipe: "dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_3.yaml" prefill: num-worker: 3 tp: 8 @@ -6499,7 +6499,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp: search-space: - spec-decoding: "mtp" conc-list: [16, 512] - recipe: "dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml:zip_override_mtp_lowlat[0]" + recipe: "dsr1/sglang/b200-fp4/1k1k/disagg/1k1k.yaml:zip_override_mtp_lowlat[0]" prefill: num-worker: 1 tp: 4 @@ -6512,7 +6512,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp: dp-attn: false - spec-decoding: "mtp" conc-list: [32, 64, 256, 512] - recipe: "dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml:zip_override_mtp_lowlat[1]" + recipe: "dsr1/sglang/b200-fp4/1k1k/disagg/1k1k.yaml:zip_override_mtp_lowlat[1]" prefill: num-worker: 1 tp: 4 @@ -6525,7 +6525,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp: dp-attn: false - spec-decoding: "mtp" conc-list: [512, 1024] - recipe: "dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml:zip_override_mtp_maxtpt[0]" + recipe: "dsr1/sglang/b200-fp4/1k1k/disagg/1k1k.yaml:zip_override_mtp_maxtpt[0]" prefill: num-worker: 1 tp: 4 @@ -6538,7 +6538,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp: dp-attn: true - spec-decoding: "mtp" conc-list: [512] - recipe: "dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml:zip_override_mtp_maxtpt[1]" + recipe: "dsr1/sglang/b200-fp4/1k1k/disagg/1k1k.yaml:zip_override_mtp_maxtpt[1]" prefill: num-worker: 1 tp: 4 @@ -6557,7 +6557,7 @@ 
dsr1-fp4-b200-dynamo-sglang-mtp: search-space: - spec-decoding: "mtp" conc-list: [64, 128] - recipe: "dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml:zip_override_mtp_lowlat[0]" + recipe: "dsr1/sglang/b200-fp4/8k1k/disagg/8k1k.yaml:zip_override_mtp_lowlat[0]" prefill: num-worker: 1 tp: 4 @@ -6570,7 +6570,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp: dp-attn: false - spec-decoding: "mtp" conc-list: [8] - recipe: "dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml:zip_override_mtp_lowlat[1]" + recipe: "dsr1/sglang/b200-fp4/8k1k/disagg/8k1k.yaml:zip_override_mtp_lowlat[1]" prefill: num-worker: 1 tp: 4 @@ -6583,7 +6583,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp: dp-attn: false - spec-decoding: "mtp" conc-list: [4, 128] - recipe: "dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml:zip_override_mtp_lowlat[2]" + recipe: "dsr1/sglang/b200-fp4/8k1k/disagg/8k1k.yaml:zip_override_mtp_lowlat[2]" prefill: num-worker: 2 tp: 4 @@ -6596,7 +6596,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp: dp-attn: false - spec-decoding: "mtp" conc-list: [4, 8, 16, 64] - recipe: "dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml:override_mtp_tp4" + recipe: "dsr1/sglang/b200-fp4/8k1k/disagg/8k1k.yaml:override_mtp_tp4" prefill: num-worker: 1 tp: 4 @@ -6623,7 +6623,7 @@ kimik2.5-fp4-gb200-dynamo-trt: search-space: # Non-MTP configurations (default spec_decoding="none") - conc-list: [ 4, 192, 360, 668 ] - recipe: "kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml" + recipe: "kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -6635,7 +6635,7 @@ kimik2.5-fp4-gb200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [ 5, 15, 30, 55 ] - recipe: "kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml" + recipe: "kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -6647,7 +6647,7 @@ kimik2.5-fp4-gb200-dynamo-trt: ep: 4 dp-attn: false - conc-list: [ 666 ] - recipe: "kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml" + recipe: "kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -6659,7 +6659,7 @@ kimik2.5-fp4-gb200-dynamo-trt: ep: 16 dp-attn: true - conc-list: [ 2253 ] - recipe: "kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml" + recipe: "kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -6671,7 +6671,7 @@ kimik2.5-fp4-gb200-dynamo-trt: ep: 32 dp-attn: true - conc-list: [ 4301, 6452 ] - recipe: "kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml" + recipe: "kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -6683,7 +6683,7 @@ kimik2.5-fp4-gb200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [ 4301 ] - recipe: "kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml" + recipe: "kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 4 @@ -6695,7 +6695,7 @@ kimik2.5-fp4-gb200-dynamo-trt: ep: 16 dp-attn: true - conc-list: [ 4301 ] - recipe: "kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml" + recipe: "kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 4 @@ 
-6712,7 +6712,7 @@ kimik2.5-fp4-gb200-dynamo-trt: search-space: # Non-MTP configurations (default spec_decoding="none") - conc-list: [ 4 ] - recipe: "kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml" + recipe: "kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -6724,7 +6724,7 @@ kimik2.5-fp4-gb200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [ 156 ] - recipe: "kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml" + recipe: "kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -6736,7 +6736,7 @@ kimik2.5-fp4-gb200-dynamo-trt: ep: 4 dp-attn: false - conc-list: [ 5, 15, 30, 60, 105 ] - recipe: "kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml" + recipe: "kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -6748,7 +6748,7 @@ kimik2.5-fp4-gb200-dynamo-trt: ep: 4 dp-attn: false - conc-list: [ 333 ] - recipe: "kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml" + recipe: "kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 4 @@ -6760,7 +6760,7 @@ kimik2.5-fp4-gb200-dynamo-trt: ep: 16 dp-attn: true - conc-list: [ 615 ] - recipe: "kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml" + recipe: "kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml" prefill: num-worker: 3 tp: 4 @@ -6772,7 +6772,7 @@ kimik2.5-fp4-gb200-dynamo-trt: ep: 16 dp-attn: true - conc-list: [ 2151 ] - recipe: "kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml" + recipe: "kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml" prefill: num-worker: 5 tp: 4 @@ -6784,7 +6784,7 @@ kimik2.5-fp4-gb200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [ 2253 ] - recipe: "kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml" + recipe: "kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml" prefill: num-worker: 7 tp: 4 @@ -6810,7 +6810,7 @@ kimik2.5-fp4-gb200-dynamo-vllm: osl: 1024 search-space: - conc-list: [256, 512, 1024, 2048, 3072, 4096] - recipe: "kimik2.5/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p1d-dep4-dep16.yaml" + recipe: "kimik2.5/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep4-dep16.yaml" prefill: num-worker: 1 tp: 4 @@ -6822,7 +6822,7 @@ kimik2.5-fp4-gb200-dynamo-vllm: ep: 16 dp-attn: true - conc-list: [4, 8, 16, 32, 64, 128] - recipe: "kimik2.5/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml" + recipe: "kimik2.5/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml" prefill: num-worker: 1 tp: 4 @@ -6837,7 +6837,7 @@ kimik2.5-fp4-gb200-dynamo-vllm: osl: 1024 search-space: - conc-list: [4, 8, 16, 32, 128] - recipe: "kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml" + recipe: "kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml" prefill: num-worker: 1 tp: 4 @@ -6849,7 +6849,7 @@ kimik2.5-fp4-gb200-dynamo-vllm: ep: 4 dp-attn: false - conc-list: [512, 1024] - recipe: "kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-3p1d-dep4-dep16.yaml" + recipe: 
"kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-3p1d-dep4-dep16.yaml" prefill: num-worker: 3 tp: 4 @@ -6861,7 +6861,7 @@ kimik2.5-fp4-gb200-dynamo-vllm: ep: 16 dp-attn: true - conc-list: [2048] - recipe: "kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-5p1d-dep4-dep8.yaml" + recipe: "kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-5p1d-dep4-dep8.yaml" prefill: num-worker: 5 tp: 4 @@ -6873,7 +6873,7 @@ kimik2.5-fp4-gb200-dynamo-vllm: ep: 8 dp-attn: true - conc-list: [3072, 4096] - recipe: "kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-6p1d-dep4-dep16.yaml" + recipe: "kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-6p1d-dep4-dep16.yaml" prefill: num-worker: 6 tp: 4 @@ -6905,7 +6905,7 @@ dsv4-fp4-gb200-dynamo-vllm: # 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch # 1p1d-dep8-tep8.yaml (offload + numa-bind stripped — see recipe header). - conc-list: [1, 4, 8, 16, 32, 64] - recipe: "dsv4/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml" + recipe: "dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml" prefill: num-worker: 1 tp: 8 @@ -6919,7 +6919,7 @@ dsv4-fp4-gb200-dynamo-vllm: # Mid throughput: 1 prefill (DP=8) + 1 wide decode (DP=16). # 6 nodes. Single prefill is plenty for 1k prompts up to ~conc 4096. - conc-list: [128, 256, 1024, 2048, 4096] - recipe: "dsv4/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p1d-dep8-dep16.yaml" + recipe: "dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep8-dep16.yaml" prefill: num-worker: 1 tp: 8 @@ -6934,7 +6934,7 @@ dsv4-fp4-gb200-dynamo-vllm: # The 4096 overlap with the 1p1d block gives a crossover point. 8192 # would saturate 1p1d's prefill, so this topology takes over there. - conc-list: [4096, 8192] - recipe: "dsv4/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml" + recipe: "dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml" prefill: num-worker: 3 tp: 8 @@ -6952,7 +6952,7 @@ dsv4-fp4-gb200-dynamo-vllm: # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). # 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch. - conc-list: [1, 4, 8, 16, 32, 64] - recipe: "dsv4/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml" + recipe: "dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml" prefill: num-worker: 1 tp: 8 @@ -6965,7 +6965,7 @@ dsv4-fp4-gb200-dynamo-vllm: dp-attn: false # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total. - conc-list: [512, 1024] - recipe: "dsv4/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml" + recipe: "dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml" prefill: num-worker: 3 tp: 8 @@ -6979,7 +6979,7 @@ dsv4-fp4-gb200-dynamo-vllm: # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes # (full cluster). Mirrors NVIDIA/srt-slurm PR #67. 
- conc-list: [4096, 8192] - recipe: "dsv4/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-7p1d-dep8-dep16.yaml" + recipe: "dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-7p1d-dep8-dep16.yaml" prefill: num-worker: 7 tp: 8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp4/1k1k/disagg/1k1k.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp4/1k1k/disagg/1k1k.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp4/8k1k/disagg/8k1k.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp4/8k1k/disagg/8k1k.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/1k1k/disagg/1k1k.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/1k1k/disagg/1k1k.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_1.yaml similarity index 100% rename from 
benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_1.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_1.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_2.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_2.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_2.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_lowlat_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_lowlat_0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_lowlat_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_lowlat_1.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_lowlat_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_lowlat_2.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_1.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_1.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_1.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_2.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_2.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_2.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_2.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k/1k/disagg/stp/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/low-latency.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k/1k/disagg/stp/low-latency.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/low-latency.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k/1k/disagg/stp/max-tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/max-tpt.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k/1k/disagg/stp/max-tpt.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/max-tpt.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k/1k/disagg/stp/mid-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/mid-curve.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k/1k/disagg/stp/mid-curve.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/mid-curve.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k/1k/disagg/stp/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k1k/disagg/stp/low-latency.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k/1k/disagg/stp/low-latency.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k1k/disagg/stp/low-latency.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k/1k/disagg/stp/max-tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k1k/disagg/stp/max-tpt.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k/1k/disagg/stp/max-tpt.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k1k/disagg/stp/max-tpt.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k/1k/disagg/stp/mid-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k1k/disagg/stp/mid-curve.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k/1k/disagg/stp/mid-curve.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k1k/disagg/stp/mid-curve.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/low-latency.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/low-latency.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/low-latency.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/low-latency.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/max-tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/max-tpt.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/max-tpt.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/max-tpt.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/mid-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/mid-curve.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/mid-curve.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/mid-curve.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/ultra-tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/ultra-tpt.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/ultra-tpt.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/ultra-tpt.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k/1k/disagg/stp/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k1k/disagg/stp/low-latency.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k/1k/disagg/stp/low-latency.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k1k/disagg/stp/low-latency.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k/1k/disagg/stp/max_tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k1k/disagg/stp/max_tpt.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k/1k/disagg/stp/max_tpt.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k1k/disagg/stp/max_tpt.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k/1k/disagg/stp/mid-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k1k/disagg/stp/mid-curve.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k/1k/disagg/stp/mid-curve.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k1k/disagg/stp/mid-curve.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k/1k/disagg/stp/low_latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k1k/disagg/stp/low_latency.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k/1k/disagg/stp/low_latency.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k1k/disagg/stp/low_latency.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k/1k/disagg/stp/max_tpt.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k1k/disagg/stp/max_tpt.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k/1k/disagg/stp/max_tpt.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k1k/disagg/stp/max_tpt.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k/1k/disagg/stp/mid_curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k1k/disagg/stp/mid_curve.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k/1k/disagg/stp/mid_curve.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k1k/disagg/stp/mid_curve.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k/1k/disagg/stp/low_latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k1k/disagg/stp/low_latency.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k/1k/disagg/stp/low_latency.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k1k/disagg/stp/low_latency.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k/1k/disagg/stp/max_tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k1k/disagg/stp/max_tpt.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k/1k/disagg/stp/max_tpt.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k1k/disagg/stp/max_tpt.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k/1k/disagg/stp/mid_curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k1k/disagg/stp/mid_curve.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k/1k/disagg/stp/mid_curve.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k1k/disagg/stp/mid_curve.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k/1k/disagg/stp/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k1k/disagg/stp/low-latency.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k/1k/disagg/stp/low-latency.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k1k/disagg/stp/low-latency.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k/1k/disagg/stp/max.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k1k/disagg/stp/max.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k/1k/disagg/stp/max.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k1k/disagg/stp/max.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k/1k/disagg/stp/mid.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k1k/disagg/stp/mid.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k/1k/disagg/stp/mid.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k1k/disagg/stp/mid.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k/1k/disagg/stp/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k1k/disagg/stp/low-latency.yaml similarity index 100% rename from 
benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k/1k/disagg/stp/low-latency.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k1k/disagg/stp/low-latency.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k/1k/disagg/stp/max.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k1k/disagg/stp/max.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k/1k/disagg/stp/max.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k1k/disagg/stp/max.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k/1k/disagg/stp/mid.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k1k/disagg/stp/mid.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k/1k/disagg/stp/mid.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k1k/disagg/stp/mid.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k/1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k/1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k/1k/disagg/mtp/h100-fp8-1p2d-max-tp-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/mtp/h100-fp8-1p2d-max-tp-mtp.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k/1k/disagg/mtp/h100-fp8-1p2d-max-tp-mtp.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/mtp/h100-fp8-1p2d-max-tp-mtp.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k/1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k/1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k/1k/disagg/stp/h100-fp8-1p2d-max-tp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/stp/h100-fp8-1p2d-max-tp.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k/1k/disagg/stp/h100-fp8-1p2d-max-tp.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/stp/h100-fp8-1p2d-max-tp.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k/1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k/1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k/1k/disagg/mtp/h100-fp8-1p1d-max-tp-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/mtp/h100-fp8-1p1d-max-tp-mtp.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k/1k/disagg/mtp/h100-fp8-1p1d-max-tp-mtp.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/mtp/h100-fp8-1p1d-max-tp-mtp.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k/1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k/1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k/1k/disagg/stp/h100-fp8-1p1d-max-tp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/stp/h100-fp8-1p1d-max-tp.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k/1k/disagg/stp/h100-fp8-1p1d-max-tp.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/stp/h100-fp8-1p1d-max-tp.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/mtp/bs256-1p6d-dep-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/mtp/bs256-1p6d-dep-mtp.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/mtp/bs256-1p6d-dep-mtp.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/mtp/bs256-1p6d-dep-mtp.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/mtp/bs256-1p6d-tp-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/mtp/bs256-1p6d-tp-mtp.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/mtp/bs256-1p6d-tp-mtp.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/mtp/bs256-1p6d-tp-mtp.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/mtp/low-latency-1p9d-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/mtp/low-latency-1p9d-mtp.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/mtp/low-latency-1p9d-mtp.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/mtp/low-latency-1p9d-mtp.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/stp/bs256-1p6d-dep.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/stp/bs256-1p6d-dep.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/stp/bs256-1p6d-dep.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/stp/bs256-1p6d-dep.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/stp/bs256-1p6d-tp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/stp/bs256-1p6d-tp.yaml similarity index 100% rename from 
benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/stp/bs256-1p6d-tp.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/stp/bs256-1p6d-tp.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/stp/low-latency-1p9d.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/stp/low-latency-1p9d.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/stp/low-latency-1p9d.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/stp/low-latency-1p9d.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs128-1p1d-dep-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs128-1p1d-dep-mtp.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs128-1p1d-dep-mtp.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs128-1p1d-dep-mtp.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs16-1p3d-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs16-1p3d-mtp.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs16-1p3d-mtp.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs16-1p3d-mtp.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs4-1p7d-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs4-1p7d-mtp.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs4-1p7d-mtp.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs4-1p7d-mtp.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs64-2p3d-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs64-2p3d-mtp.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs64-2p3d-mtp.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs64-2p3d-mtp.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs8-1p6d-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs8-1p6d-mtp.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs8-1p6d-mtp.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs8-1p6d-mtp.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs128-1p1d-dep.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs128-1p1d-dep.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs128-1p1d-dep.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs128-1p1d-dep.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs16-1p3d.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs16-1p3d.yaml 
similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs16-1p3d.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs16-1p3d.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs4-1p7d.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs4-1p7d.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs4-1p7d.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs4-1p7d.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs64-2p3d.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs64-2p3d.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs64-2p3d.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs64-2p3d.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs8-1p6d.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs8-1p6d.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs8-1p6d.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs8-1p6d.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml rename to 
benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml similarity index 100% rename from 
benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml similarity index 100% rename from 
benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml similarity index 100% rename from 
benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml similarity index 100% rename from 
benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml similarity index 100% rename from 
benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml similarity index 100% rename from 
benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml rename to 
benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml similarity index 100% rename from 
benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml similarity index 100% 
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml similarity index 100% rename from 
benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep8-dep16.yaml similarity index 100% rename from 
benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p1d-dep8-dep16.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep8-dep16.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-7p1d-dep8-dep16.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-7p1d-dep8-dep16.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-7p1d-dep8-dep16.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml 
similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml similarity index 100% rename from 
benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml similarity index 100% rename from 
benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p1d-dep4-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep4-dep16.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p1d-dep4-dep16.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep4-dep16.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-3p1d-dep4-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-3p1d-dep4-dep16.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-3p1d-dep4-dep16.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-3p1d-dep4-dep16.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-5p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-5p1d-dep4-dep8.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-5p1d-dep4-dep8.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-5p1d-dep4-dep8.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-6p1d-dep4-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-6p1d-dep4-dep16.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-6p1d-dep4-dep16.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-6p1d-dep4-dep16.yaml From 1ca06965a86a3cddfd142ab1232207134bfa970c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 28 Apr 2026 14:09:32 -0500 Subject: [PATCH 06/16] runners: pin all srt-slurm clones to 
NVIDIA/srt-slurm@52e697d5 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop the ishandhanani/srt-slurm@sa-submission-q1-2026 fallback in launch_gb200-nv.sh — every launcher now clones NVIDIA/srt-slurm at the pinned commit 52e697d (nginx fd-limit fix on origin/main, Apr 2026). Pinning to a SHA instead of a moving branch keeps benchmark runs reproducible across upstream churn. Rename the helper's SRT_BRANCH env var to SRT_REF for accuracy (it accepts any git ref, not just a branch). Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/benchmark_lib.sh | 10 ++++++---- runners/launch_gb200-nv.sh | 10 +--------- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 92998de27..01ed0657c 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -887,22 +887,24 @@ sanitize_image_filename() { # # All inputs are env vars (set before calling); all are optional: # SRT_REPO_URL default https://github.com/NVIDIA/srt-slurm.git -# SRT_BRANCH default sa-submission-q2-2026 +# SRT_REF default pinned commit SHA on NVIDIA/srt-slurm; accepts +# any git ref (branch / tag / SHA). Pinning to a SHA keeps +# benchmark runs reproducible across srt-slurm churn. # SRT_REPO_DIR default srt-slurm (relative to current cwd) # UV_INSTALL_DIR default $HOME/.local/bin (uv's own default) # UV_VENV_DIR default .venv (inside the cloned repo) clone_and_install_srtctl() { local repo_url="${SRT_REPO_URL:-https://github.com/NVIDIA/srt-slurm.git}" - local branch="${SRT_BRANCH:-sa-submission-q2-2026}" + local ref="${SRT_REF:-52e697d595569b1055b3bb436e06408a6f078293}" local repo_dir="${SRT_REPO_DIR:-srt-slurm}" local uv_install_dir="${UV_INSTALL_DIR:-${HOME}/.local/bin}" local uv_venv_dir="${UV_VENV_DIR:-.venv}" - echo "Cloning ${repo_url}@${branch} into ${repo_dir}..." + echo "Cloning ${repo_url}@${ref} into ${repo_dir}..." rm -rf "$repo_dir" git clone "$repo_url" "$repo_dir" cd "$repo_dir" || return 1 - git checkout "$branch" + git checkout "$ref" echo "Installing uv + srtctl into venv at ${uv_venv_dir}..." export UV_INSTALL_DIR="$uv_install_dir" diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index e9c3e62b8..c8c822c6f 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -139,15 +139,7 @@ fi # We only clone srt-slurm to install srtctl + pick up its sibling configs # (configs/, expert-distributions/, etc). The recipe itself is supplied as an # absolute CONFIG_FILE pointing at benchmarks/multi_node/srt-slurm-recipes/. -if [[ $FRAMEWORK == "dynamo-vllm" || ( $FRAMEWORK == "dynamo-trt" && $MODEL_PREFIX == "kimik2.5" ) ]]; then - SRT_REPO_URL=https://github.com/NVIDIA/srt-slurm.git - SRT_BRANCH=sa-submission-q2-2026 -else - SRT_REPO_URL=https://github.com/ishandhanani/srt-slurm.git - SRT_BRANCH=sa-submission-q1-2026 -fi -SRT_REPO_URL="$SRT_REPO_URL" SRT_BRANCH="$SRT_BRANCH" \ - clone_and_install_srtctl || exit 1 +clone_and_install_srtctl || exit 1 echo "Configs available at: $SRT_REPO_DIR/" From 0f755d22ad70b90d2b1972f752880e64c9c77262 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 28 Apr 2026 14:11:24 -0500 Subject: [PATCH 07/16] runners: hardcode srt-slurm pin in benchmark_lib helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop env-var override for SRT_REPO_URL / SRT_REF — every benchmark run must use the same pinned srtctl, no ad-hoc overrides at the call site. 
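With the override gone, a call site needs nothing but the helper itself; a sketch (illustrative values; SRT_REPO_DIR, UV_INSTALL_DIR, and UV_VENV_DIR remain optional env inputs per the helper's header):

    SRT_REPO_DIR=srt-slurm UV_VENV_DIR=.venv clone_and_install_srtctl || exit 1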
Bumping the pin is now a one-line edit to benchmark_lib.sh. Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/benchmark_lib.sh | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 01ed0657c..ec2d5a4f1 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -885,17 +885,16 @@ sanitize_image_filename() { # successfully, cwd is the cloned repo and the venv is active. Idempotent on # uv: skips re-curl if the binary is already present at $UV_INSTALL_DIR. # -# All inputs are env vars (set before calling); all are optional: -# SRT_REPO_URL default https://github.com/NVIDIA/srt-slurm.git -# SRT_REF default pinned commit SHA on NVIDIA/srt-slurm; accepts -# any git ref (branch / tag / SHA). Pinning to a SHA keeps -# benchmark runs reproducible across srt-slurm churn. +# The srt-slurm commit is pinned (not env-var overridable) so every benchmark +# run uses the exact same srtctl. To bump it, edit the `ref=` line below. +# +# All other inputs are env vars (set before calling); all are optional: # SRT_REPO_DIR default srt-slurm (relative to current cwd) # UV_INSTALL_DIR default $HOME/.local/bin (uv's own default) # UV_VENV_DIR default .venv (inside the cloned repo) clone_and_install_srtctl() { - local repo_url="${SRT_REPO_URL:-https://github.com/NVIDIA/srt-slurm.git}" - local ref="${SRT_REF:-52e697d595569b1055b3bb436e06408a6f078293}" + local repo_url="https://github.com/NVIDIA/srt-slurm.git" + local ref="52e697d595569b1055b3bb436e06408a6f078293" local repo_dir="${SRT_REPO_DIR:-srt-slurm}" local uv_install_dir="${UV_INSTALL_DIR:-${HOME}/.local/bin}" local uv_venv_dir="${UV_VENV_DIR:-.venv}" From 6f99d485e39f5b9d6405e78cb5a9940237ea3459 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 28 Apr 2026 14:35:04 -0500 Subject: [PATCH 08/16] srt-slurm: wire custom-script bench, drop sa-bench dependency (proof-of-life) Stop relying on srt-slurm's bundled `benchmark.type: sa-bench` (which ships its own copy of bench_serving.py inside the upstream repo) and instead use `benchmark.type: custom` to run *this* repo's utils/bench_serving against the already-ready frontend. Avoids dual-maintaining the bench client. Plumbing: - benchmarks/multi_node/srt_bench.sh: thin wrapper that mirrors sa-bench's per-conc warmup-then-bench loop, writes results to the same /logs/sa-bench_isl_${ISL}_osl_${OSL}/results_concurrency_${conc}_gpus_${TOTAL_GPUS}_ctx_${PREFILL_GPUS}_gen_${DECODE_GPUS}.json layout the launcher result-harvesters already grep, with conc list parsed from x-separated env (e.g. "128x256x1024"). - Recipe shape: add `container_mounts: { $INFMAX_WORKSPACE: /infmax-workspace }` + replace `benchmark: { type: sa-bench, ... }` with `benchmark: { type: custom, command: "bash /infmax-workspace/...", env: {...} }`. Migrated as proof-of-life: - dsr1/trtllm/b200-fp4/1k1k mtp ctx1_gen2_dep8_batch64_eplb0_mtp2 (TRT-LLM) - dsr1/sglang/gb200-fp4/1k1k stp low-latency (SGLang) - dsv4/vllm/gb200-fp4/1k1k stp disagg-gb200-1p1d-dep8-tep8 (vLLM) Remaining ~360 recipes still use sa-bench; they migrate in a follow-up once this triplet runs end-to-end on a real cluster. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/CONFIGS.md | 28 ++++ .../1k1k/disagg/stp/low-latency.yaml | 25 ++- .../ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml | 25 ++- .../stp/disagg-gb200-1p1d-dep8-tep8.yaml | 27 +++- benchmarks/multi_node/srt_bench.sh | 152 ++++++++++++++++++ 5 files changed, 241 insertions(+), 16 deletions(-) create mode 100755 benchmarks/multi_node/srt_bench.sh diff --git a/.github/configs/CONFIGS.md b/.github/configs/CONFIGS.md index 46755ef31..eb6841b0c 100644 --- a/.github/configs/CONFIGS.md +++ b/.github/configs/CONFIGS.md @@ -73,6 +73,34 @@ search-space: - `recipe` is optional: multi-node entries that do *not* go through srt-slurm (e.g. dynamo-sglang aggregated topologies that drive their own bash) leave it unset. - Recipes live under `benchmarks/multi_node/srt-slurm-recipes/` organized as `<model>/<framework>/<gpu>-<precision>/<seqlens>/<mode>/<decoding>/<recipe>.yaml` — e.g. `dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml`. A handful of sglang-style files that carry override sections spanning both stp and mtp are parked one level shallower (the trailing `<decoding>/` segment is omitted). The benchmark template resolves `recipe` to an absolute path and passes it to the launcher as `CONFIG_FILE`, so launchers do not see the relative form. +### Custom-script benchmarking + +Recipes are migrating from srt-slurm's bundled `benchmark.type: sa-bench` to `benchmark.type: custom` so the benchmark client lives in this repo (`utils/bench_serving/benchmark_serving.py`) instead of being maintained twice. New shape: + +```yaml +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + +benchmark: + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-r1-fp4" # served-model-name advertised by the engine + ISL: "1024" + OSL: "1024" + CONCURRENCIES: "128x256x1024" # x-separated, looped inside srt_bench.sh + REQ_RATE: "inf" + IS_DISAGGREGATED: "true" + PREFILL_GPUS: "4" # per prefill worker + DECODE_GPUS: "8" # per decode worker + TOTAL_GPUS: "20" # sum across all workers + USE_CHAT_TEMPLATE: "false" # optional, defaults to true +``` + +`benchmarks/multi_node/srt_bench.sh` is a thin wrapper around `utils/bench_serving/benchmark_serving.py` that mirrors sa-bench's per-conc warmup-then-bench loop and writes results to `/logs/sa-bench_isl_${ISL}_osl_${OSL}/results_concurrency_${conc}_gpus_${TOTAL_GPUS}_ctx_${PREFILL_GPUS}_gen_${DECODE_GPUS}.json` so the existing launcher result-harvester picks them up unchanged. See the script's header for the full env-var contract. + +The `container_mounts` block bind-mounts the host-side `$INFMAX_WORKSPACE` (set by the launcher to `$GITHUB_WORKSPACE`) at `/infmax-workspace` inside srt-slurm's benchmark container, so the wrapper and bench client are reachable at known paths. `srtctl` resolves `$INFMAX_WORKSPACE` via `os.path.expandvars` at submission time. + ## Runners The `runners.yaml` config represents the available runners in the repository. The keys are the runner *types* (i.e., the GPUs as well as some specific combinations like `b200-trt`) whereas the value is a list of *runner nodes*. This config is used to verify the master configs. diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/low-latency.yaml index 8729aa6fd..2f5deea27 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/low-latency.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/low-latency.yaml @@ -108,9 +108,24 @@ backend: tensor-parallel-size: 4 expert-parallel-size: 1 +# InferenceX bench-serving wrapper, invoked via srt-slurm `benchmark.type: custom`. +# srt_bench.sh lives at /infmax-workspace/benchmarks/multi_node/srt_bench.sh +# inside the benchmark container; the host-side workspace is bind-mounted via +# the container_mounts block below. See benchmarks/multi_node/srt_bench.sh for +# the full env-var contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4x8x32" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + ISL: "1024" + OSL: "1024" + CONCURRENCIES: "4x8x32" + REQ_RATE: "inf" + IS_DISAGGREGATED: "true" + PREFILL_GPUS: "4" + DECODE_GPUS: "4" + TOTAL_GPUS: "12" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml index d4d9de835..3ca5ffd12 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml @@ -104,12 +104,27 @@ backend: decoding_type: MTP num_nextn_predict_layers: 2 +# InferenceX bench-serving wrapper, invoked via srt-slurm `benchmark.type: custom`. +# srt_bench.sh lives at /infmax-workspace/benchmarks/multi_node/srt_bench.sh +# inside the benchmark container; the host-side workspace is bind-mounted via +# the container_mounts block below. See benchmarks/multi_node/srt_bench.sh for +# the full env-var contract.
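+# Topology sketch for this recipe, reading its name (ctx1_gen2_dep8) against the
+# env below (an interpretation, not taken from upstream docs): 1 ctx worker on
+# 4 GPUs plus 2 gen workers on 8 GPUs each, so TOTAL_GPUS = 4 + 2*8 = 20.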
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1214" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-r1-fp4" + ISL: "1024" + OSL: "1024" + CONCURRENCIES: "1214" + REQ_RATE: "inf" + IS_DISAGGREGATED: "true" + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "20" frontend: nginx_container: "nginx-sqsh" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml index 984c79526..77da875f6 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml @@ -135,10 +135,25 @@ backend: enable-sleep-mode: true tokenizer-mode: deepseek_v4 +# InferenceX bench-serving wrapper, invoked via srt-slurm `benchmark.type: custom`. +# srt_bench.sh lives at /infmax-workspace/benchmarks/multi_node/srt_bench.sh +# inside the benchmark container; the host-side workspace is bind-mounted via +# the container_mounts block below. See benchmarks/multi_node/srt_bench.sh for +# the full env-var contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1x4x8x16x32x64" - req_rate: "inf" - use_chat_template: false + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-V4-Pro" + ISL: "1024" + OSL: "1024" + CONCURRENCIES: "1x4x8x16x32x64" + REQ_RATE: "inf" + USE_CHAT_TEMPLATE: "false" + IS_DISAGGREGATED: "true" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "16" diff --git a/benchmarks/multi_node/srt_bench.sh b/benchmarks/multi_node/srt_bench.sh new file mode 100755 index 000000000..418895c0f --- /dev/null +++ b/benchmarks/multi_node/srt_bench.sh @@ -0,0 +1,152 @@ +#!/usr/bin/env bash +# Drop-in replacement for srt-slurm's bundled `sa-bench` benchmark, wired to +# this repo's utils/bench_serving/benchmark_serving.py via srt-slurm's +# `benchmark.type: custom` feature. srt-slurm owns server bring-up; this +# script runs against the already-ready frontend on the head node, then +# writes one results JSON per concurrency to a path the launcher's +# result-harvester recognizes. +# +# Required env (set via `benchmark.env` in the recipe yaml): +# ISL OSL CONCURRENCIES MODEL_NAME +# IS_DISAGGREGATED TOTAL_GPUS (PREFILL_GPUS/DECODE_GPUS optional, default 0) +# +# Optional env (defaults shown): +# PORT=8000 frontend port reachable at localhost +# REQ_RATE=inf +# RANDOM_RANGE_RATIO=0.8 +# NUM_PROMPTS_MULT=10 prompts per conc = NUM_PROMPTS_MULT * conc +# NUM_WARMUP_MULT=2 warmup prompts per conc = NUM_WARMUP_MULT * conc +# USE_CHAT_TEMPLATE=true +# CUSTOM_TOKENIZER= (empty: skip --custom-tokenizer) +# DATASET_NAME=random +# DATASET_PATH= (only used when DATASET_NAME != random) +# TOKENIZER_PATH=$MODEL_NAME (override with a local or container path) +# PORT_HEALTH_PATH=/v1/models +# +# The InferenceX repo is bind-mounted into the container at /infmax-workspace +# (configured by the recipe's `container_mounts` block).
This script lives at +# /infmax-workspace/benchmarks/multi_node/srt_bench.sh and shells out to +# /infmax-workspace/utils/bench_serving/benchmark_serving.py. +set -euo pipefail + +INFMAX_WS="${INFMAX_CONTAINER_WORKSPACE:-/infmax-workspace}" + +require() { + for v in "$@"; do + if [[ -z "${!v:-}" ]]; then + echo "ERROR: required env var '$v' is unset" >&2 + exit 64 + fi + done +} +require ISL OSL CONCURRENCIES MODEL_NAME IS_DISAGGREGATED TOTAL_GPUS + +PORT="${PORT:-8000}" +REQ_RATE="${REQ_RATE:-inf}" +RANDOM_RANGE_RATIO="${RANDOM_RANGE_RATIO:-0.8}" +NUM_PROMPTS_MULT="${NUM_PROMPTS_MULT:-10}" +NUM_WARMUP_MULT="${NUM_WARMUP_MULT:-2}" +USE_CHAT_TEMPLATE="${USE_CHAT_TEMPLATE:-true}" +CUSTOM_TOKENIZER="${CUSTOM_TOKENIZER:-}" +DATASET_NAME="${DATASET_NAME:-random}" +DATASET_PATH="${DATASET_PATH:-}" +PREFILL_GPUS="${PREFILL_GPUS:-0}" +DECODE_GPUS="${DECODE_GPUS:-0}" + +ENDPOINT="http://localhost:${PORT}" +RESULT_DIR="/logs/sa-bench_isl_${ISL}_osl_${OSL}" +mkdir -p "$RESULT_DIR" + +BENCH_PY="${INFMAX_WS}/utils/bench_serving/benchmark_serving.py" +[[ -f "$BENCH_PY" ]] || { echo "ERROR: benchmark_serving.py not found at $BENCH_PY (mount $INFMAX_WS missing?)" >&2; exit 65; } + +# Bench-serving deps. The srt-slurm worker container ships most of these but +# not all (datasets in particular). Reuse system-site-packages so we don't +# rebuild what's already there. +ensure_deps() { + local deps=(aiohttp numpy pandas datasets Pillow tqdm transformers huggingface_hub) + if python3 -c "import aiohttp, numpy, pandas, datasets, PIL, tqdm, transformers, huggingface_hub" 2>/dev/null; then + return + fi + local venv="/tmp/srt-bench-venv" + [[ -d "$venv" ]] || python3 -m venv --system-site-packages "$venv" + # shellcheck disable=SC1091 + source "$venv/bin/activate" + pip install --quiet "${deps[@]}" +} +ensure_deps + +# Verify endpoint (health path is the documented PORT_HEALTH_PATH knob) +echo "Verifying endpoint at $ENDPOINT ..." +curl -fsS "${ENDPOINT}${PORT_HEALTH_PATH:-/v1/models}" >/dev/null || { + echo "ERROR: endpoint $ENDPOINT did not respond on ${PORT_HEALTH_PATH:-/v1/models}" >&2 + exit 66 +} + +ulimit -n 65536 2>/dev/null || true + +DATASET_ARGS=(--dataset-name "$DATASET_NAME") +[[ -n "$DATASET_PATH" ]] && DATASET_ARGS+=(--dataset-path "$DATASET_PATH") + +RANDOM_LEN_ARGS=() +if [[ "$DATASET_NAME" == "random" ]]; then + RANDOM_LEN_ARGS=( + --random-input-len "$ISL" + --random-output-len "$OSL" + --random-range-ratio "$RANDOM_RANGE_RATIO" + ) +fi + +CHAT_TEMPLATE_ARGS=() +[[ "$USE_CHAT_TEMPLATE" == "true" ]] && CHAT_TEMPLATE_ARGS+=(--use-chat-template) + +CUSTOM_TOKENIZER_ARGS=() +[[ -n "$CUSTOM_TOKENIZER" ]] && CUSTOM_TOKENIZER_ARGS+=(--custom-tokenizer "$CUSTOM_TOKENIZER") + +# `tokenizer` is required by benchmark_serving.py; pass MODEL_NAME by default +# (HF will fetch). Recipe can override via TOKENIZER_PATH for a local path. +TOKENIZER_PATH="${TOKENIZER_PATH:-$MODEL_NAME}" + +# Concurrency list is "x"-separated for parity with sa-bench.
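+# e.g. CONCURRENCIES="4x8x32" → CONC_LIST=(4 8 32); the loop below then runs
+# one warmup pass and one measured pass per entry.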
+IFS='x' read -r -a CONC_LIST <<< "$CONCURRENCIES" + +run_bench() { + local conc=$1 + local n_prompts=$2 + local request_rate=$3 + shift 3 + python3 -u "$BENCH_PY" \ + --model "$MODEL_NAME" --tokenizer "$TOKENIZER_PATH" \ + --host localhost --port "$PORT" \ + --backend dynamo --endpoint /v1/completions \ + --disable-tqdm \ + "${DATASET_ARGS[@]}" \ + --num-prompts "$n_prompts" \ + "${RANDOM_LEN_ARGS[@]}" \ + --ignore-eos \ + --request-rate "$request_rate" \ + --percentile-metrics ttft,tpot,itl,e2el \ + --max-concurrency "$conc" \ + --trust-remote-code \ + "${CHAT_TEMPLATE_ARGS[@]}" \ + "${CUSTOM_TOKENIZER_ARGS[@]}" \ + "$@" +} + +for conc in "${CONC_LIST[@]}"; do + echo "=== conc=$conc warmup ===" + run_bench "$conc" "$((conc * NUM_WARMUP_MULT))" 250 || true + + if [[ "$IS_DISAGGREGATED" == "true" ]]; then + result_filename="results_concurrency_${conc}_gpus_${TOTAL_GPUS}_ctx_${PREFILL_GPUS}_gen_${DECODE_GPUS}.json" + else + result_filename="results_concurrency_${conc}_gpus_${TOTAL_GPUS}.json" + fi + + echo "=== conc=$conc bench → $RESULT_DIR/$result_filename ===" + run_bench "$conc" "$((conc * NUM_PROMPTS_MULT))" "$REQ_RATE" \ + --result-dir "$RESULT_DIR" \ + --result-filename "$result_filename" +done + +echo "Done. Results in $RESULT_DIR." From 290fcb68749358131fad4416a8eb359c44f038de Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 28 Apr 2026 15:12:00 -0500 Subject: [PATCH 09/16] =?UTF-8?q?srt-slurm:=20simplify=20custom-bench=20pl?= =?UTF-8?q?umbing=20=E2=80=94=20drop=20redundant=20recipe=20env?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two pieces, one commit: 1. benchmark_lib.sh's run_benchmark_serving() gains optional pass-throughs for --tokenizer / --endpoint / --dataset-name / --dataset-path so the multi-node srt_bench.sh wrapper can reuse it instead of forking its own command-build. (--request-rate stays hardcoded "inf" — no recipe-level override.) ~50 lines of duplicated shell deleted from srt_bench.sh. 2. Recipe `benchmark.env` blocks lose every variable that is already exported by .github/workflows/benchmark-multinode-tmpl.yml at the workflow step level: MODEL, ISL, OSL, CONC_LIST, DISAGG, RANDOM_RANGE_RATIO. Those propagate down through srtctl → srun (default --export=ALL) → pyxis into the bench container, so srt_bench.sh reads them directly. Recipes now only carry per-recipe topology knobs (PREFILL_GPUS / DECODE_GPUS / TOTAL_GPUS — used in the result filename) plus the rare overrides. Tokenizer is hardcoded to /model — srtctl's RuntimeContext.create unconditionally bind-mounts the local model dir at that path in every container, so AutoTokenizer.from_pretrained("/model") loads from the same files the engine is serving. No HF Hub egress, works for HF-id and alias-only `model:` values alike, no `TOKENIZER_PATH` knob in recipes.
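The chain, sketched with illustrative values (the srtctl invocation itself is elided; srun's default --export=ALL is the step that carries the workflow env into the container):

    # workflow step (benchmark-multinode-tmpl.yml) exports, e.g.:
    export MODEL=deepseek-r1-fp4 ISL=1024 OSL=1024 CONC_LIST=4x8x32 DISAGG=true
    # launcher -> srtctl -> srun --export=ALL -> pyxis container, where:
    bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh  # reads them from the env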
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/CONFIGS.md | 20 +- benchmarks/benchmark_lib.sh | 37 +++- .../1k1k/disagg/stp/low-latency.yaml | 15 +- .../ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml | 20 +- .../stp/disagg-gb200-1p1d-dep8-tep8.yaml | 14 +- benchmarks/multi_node/srt_bench.sh | 192 ++++++++---------- 6 files changed, 146 insertions(+), 152 deletions(-)
diff --git a/.github/configs/CONFIGS.md b/.github/configs/CONFIGS.md index eb6841b0c..302605fbb 100644 --- a/.github/configs/CONFIGS.md +++ b/.github/configs/CONFIGS.md @@ -85,19 +85,17 @@ benchmark: type: "custom" command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" env: - MODEL_NAME: "deepseek-r1-fp4" # served-model-name advertised by the engine - ISL: "1024" - OSL: "1024" - CONCURRENCIES: "128x256x1024" # x-separated, looped inside srt_bench.sh - REQ_RATE: "inf" - IS_DISAGGREGATED: "true" - PREFILL_GPUS: "4" # per prefill worker - DECODE_GPUS: "8" # per decode worker - TOTAL_GPUS: "20" # sum across all workers - USE_CHAT_TEMPLATE: "false" # optional, defaults to true + PREFILL_GPUS: "4" # per prefill worker (filename component) + DECODE_GPUS: "8" # per decode worker (filename component) + TOTAL_GPUS: "20" # sum across workers (filename component) + # MODEL_NAME: "..." # only when server's served-model-name + # differs from master-yaml's `model:` + # USE_CHAT_TEMPLATE: "false" # only when overriding default (true) ```
-`benchmarks/multi_node/srt_bench.sh` is a thin wrapper around `utils/bench_serving/benchmark_serving.py` that mirrors sa-bench's per-conc warmup-then-bench loop and writes results to `/logs/sa-bench_isl_${ISL}_osl_${OSL}/results_concurrency_${conc}_gpus_${TOTAL_GPUS}_ctx_${PREFILL_GPUS}_gen_${DECODE_GPUS}.json` so the existing launcher result-harvester picks them up unchanged. See the script's header for the full env-var contract.
+`MODEL`, `ISL`, `OSL`, `CONC_LIST`, `DISAGG`, `RANDOM_RANGE_RATIO` are exported by `benchmark-multinode-tmpl.yml` at the workflow step and propagate through the launcher → `srtctl` → `srun` (default `--export=ALL`) → pyxis into the benchmark container, so they don't need to be re-declared in `benchmark.env`. The recipe only carries per-recipe topology knobs (`PREFILL_GPUS`/`DECODE_GPUS`/`TOTAL_GPUS`, used in the result filename) plus the rare overrides (`MODEL_NAME` when the server's served-model-name diverges from `model:`, `USE_CHAT_TEMPLATE: false` for tokenizers that have no chat template, etc.).
+
+`benchmarks/multi_node/srt_bench.sh` is a thin wrapper around `run_benchmark_serving()` in `benchmarks/benchmark_lib.sh` (the same shim every single-node bench script uses). It loops once per concurrency in `$CONC_LIST` and writes results to `/logs/sa-bench_isl_${ISL}_osl_${OSL}/results_concurrency_${conc}_gpus_${TOTAL_GPUS}_ctx_${PREFILL_GPUS}_gen_${DECODE_GPUS}.json` so existing launcher result-harvesters pick them up unchanged. Tokenizer is loaded from `/model` — `srtctl`'s `RuntimeContext.create` auto-mounts the model dir at that path in every container, so we don't need any HF Hub egress. The `container_mounts` block bind-mounts the host-side `$INFMAX_WORKSPACE` (set by the launcher to `$GITHUB_WORKSPACE`) at `/infmax-workspace` inside srt-slurm's benchmark container, so the wrapper and bench client are reachable at known paths. `srtctl` resolves `$INFMAX_WORKSPACE` via `os.path.expandvars` at submission time.
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index ec2d5a4f1..e42926dde 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -206,6 +206,12 @@ run_benchmark_serving() { local dsv4=false local trust_remote_code=false local server_pid="" + # Optional knobs surfaced for the multi-node srt_bench.sh wrapper so it + # can use this same command-build instead of forking its own. + local endpoint="" + local dataset_name="random" + local dataset_path="" + local tokenizer="" while [[ $# -gt 0 ]]; do case $1 in @@ -270,6 +276,22 @@ run_benchmark_serving() { server_pid="$2" shift 2 ;; + --endpoint) + endpoint="$2" + shift 2 + ;; + --dataset-name) + dataset_name="$2" + shift 2 + ;; + --dataset-path) + dataset_path="$2" + shift 2 + ;; + --tokenizer) + tokenizer="$2" + shift 2 + ;; *) echo "Unknown parameter: $1" return 1 @@ -341,7 +363,7 @@ run_benchmark_serving() { --model "$model" --backend "$backend" --base-url "http://0.0.0.0:$port" - --dataset-name random + --dataset-name "$dataset_name" --random-input-len "$input_len" --random-output-len "$output_len" --random-range-ratio "$random_range_ratio" @@ -356,7 +378,18 @@ run_benchmark_serving() { --result-dir "$result_dir" --result-filename "$result_filename.json" ) - + + # Optional pass-throughs. + if [[ -n "$endpoint" ]]; then + benchmark_cmd+=(--endpoint "$endpoint") + fi + if [[ -n "$dataset_path" ]]; then + benchmark_cmd+=(--dataset-path "$dataset_path") + fi + if [[ -n "$tokenizer" ]]; then + benchmark_cmd+=(--tokenizer "$tokenizer") + fi + # Add --use-chat-template if requested if [[ "$use_chat_template" == true ]]; then benchmark_cmd+=(--use-chat-template)
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/low-latency.yaml index 2f5deea27..b280e7176 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/low-latency.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/low-latency.yaml @@ -109,10 +109,10 @@ backend: expert-parallel-size: 1 # InferenceX bench-serving wrapper, invoked via srt-slurm `benchmark.type: custom`. -# srt_bench.sh lives at /infmax-workspace/benchmarks/multi_node/srt_bench.sh -# inside the benchmark container; the host-side workspace is bind-mounted via -# the container_mounts block below. See benchmarks/multi_node/srt_bench.sh for -# the full env-var contract. +# Most env (MODEL, ISL, OSL, CONC_LIST, DISAGG) is exported by +# benchmark-multinode-tmpl.yml and propagated through srtctl → srun → pyxis, +# so the recipe only carries per-recipe knobs that have no workflow source. +# See benchmarks/multi_node/srt_bench.sh for the full env contract.
container_mounts: "$INFMAX_WORKSPACE": "/infmax-workspace" @@ -120,12 +120,9 @@ benchmark: type: "custom" command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" env: + # Override $MODEL because this sglang recipe advertises a different + # served-model-name from what master-yaml's `model:` field is set to. MODEL_NAME: "deepseek-ai/DeepSeek-R1" - ISL: "1024" - OSL: "1024" - CONCURRENCIES: "4x8x32" - REQ_RATE: "inf" - IS_DISAGGREGATED: "true" PREFILL_GPUS: "4" DECODE_GPUS: "4" TOTAL_GPUS: "12" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml index 3ca5ffd12..7e59b1617 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml @@ -105,10 +105,10 @@ backend: num_nextn_predict_layers: 2 # InferenceX bench-serving wrapper, invoked via srt-slurm `benchmark.type: custom`. -# srt_bench.sh lives at /infmax-workspace/benchmarks/multi_node/srt_bench.sh -# inside the benchmark container; the host-side workspace is bind-mounted via -# the container_mounts block below. See benchmarks/multi_node/srt_bench.sh for -# the full env-var contract. +# Most env (MODEL, ISL, OSL, CONC_LIST, DISAGG) is exported by +# benchmark-multinode-tmpl.yml and propagated through srtctl → srun → pyxis, +# so the recipe only carries per-recipe knobs that have no workflow source. +# See benchmarks/multi_node/srt_bench.sh for the full env contract. container_mounts: "$INFMAX_WORKSPACE": "/infmax-workspace" @@ -116,15 +116,9 @@ benchmark: type: "custom" command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" env: - MODEL_NAME: "deepseek-r1-fp4" - ISL: "1024" - OSL: "1024" - CONCURRENCIES: "1214" - REQ_RATE: "inf" - IS_DISAGGREGATED: "true" - PREFILL_GPUS: "4" - DECODE_GPUS: "8" - TOTAL_GPUS: "20" + PREFILL_GPUS: "4" # per prefill worker + DECODE_GPUS: "8" # per decode worker + TOTAL_GPUS: "20" # sum across all workers frontend: nginx_container: "nginx-sqsh" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml index 77da875f6..15790d70f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml @@ -136,10 +136,10 @@ backend: tokenizer-mode: deepseek_v4 # InferenceX bench-serving wrapper, invoked via srt-slurm `benchmark.type: custom`. -# srt_bench.sh lives at /infmax-workspace/benchmarks/multi_node/srt_bench.sh -# inside the benchmark container; the host-side workspace is bind-mounted via -# the container_mounts block below. See benchmarks/multi_node/srt_bench.sh for -# the full env-var contract. +# Most env (MODEL, ISL, OSL, CONC_LIST, DISAGG) is exported by +# benchmark-multinode-tmpl.yml and propagated through srtctl → srun → pyxis, +# so the recipe only carries per-recipe knobs that have no workflow source. +# See benchmarks/multi_node/srt_bench.sh for the full env contract. 
container_mounts: "$INFMAX_WORKSPACE": "/infmax-workspace" @@ -147,13 +147,7 @@ benchmark: type: "custom" command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" env: - MODEL_NAME: "deepseek-ai/DeepSeek-V4-Pro" - ISL: "1024" - OSL: "1024" - CONCURRENCIES: "1x4x8x16x32x64" - REQ_RATE: "inf" USE_CHAT_TEMPLATE: "false" - IS_DISAGGREGATED: "true" PREFILL_GPUS: "8" DECODE_GPUS: "8" TOTAL_GPUS: "16" diff --git a/benchmarks/multi_node/srt_bench.sh b/benchmarks/multi_node/srt_bench.sh index 418895c0f..9e82a08cb 100755 --- a/benchmarks/multi_node/srt_bench.sh +++ b/benchmarks/multi_node/srt_bench.sh @@ -1,69 +1,79 @@ #!/usr/bin/env bash -# Drop-in replacement for srt-slurm's bundled `sa-bench` benchmark, wired to -# this repo's utils/bench_serving/benchmark_serving.py via srt-slurm's -# `benchmark.type: custom` feature. srt-slurm owns server bring-up; this -# script runs against the already-ready frontend on the head node, then -# writes one results JSON per concurrency to a path the launcher's -# result-harvester recognizes. +# Multi-node bench-serving wrapper invoked by srt-slurm via +# `benchmark.type: custom`. srt-slurm owns server bring-up; this script runs +# inside the same job's benchmark container against the already-ready +# frontend on the head node, then writes one results JSON per concurrency to +# /logs/sa-bench_isl__osl_/ — the same path the launcher's existing +# result-harvesters glob. # -# Required env (set via `benchmark.env` in the recipe yaml): -# ISL OSL CONCURRENCIES MODEL_NAME -# IS_DISAGGREGATED TOTAL_GPUS PREFILL_GPUS DECODE_GPUS +# This is a thin loop on top of run_benchmark_serving() in benchmark_lib.sh +# (the same shim every single-node bench script uses), so any future change +# to bench-serving CLI conventions, profiling, server-health monitoring, etc. +# applies here automatically. # -# Optional env (defaults shown): +# Reads from env. 
Most of these are *already* exported by +# .github/workflows/benchmark-multinode-tmpl.yml at the workflow step level +# and propagate down through the launcher → srtctl → srun (default +# --export=ALL) → pyxis → bench container, so recipes do not need to +# re-declare them in `benchmark.env`: +# +# $MODEL served-model-name; matches workflow `inputs.model` +# $ISL $OSL sequence lengths +# $CONC_LIST space-separated concurrency list +# $DISAGG "true" / "false" — disagg vs aggregated +# $RANDOM_RANGE_RATIO 0.8 (workflow default) +# +# Per-recipe knobs that *do* live in `benchmark.env` (no workflow equivalent): +# PREFILL_GPUS per-prefill-worker GPU count (filename component) +# DECODE_GPUS per-decode-worker GPU count (filename component) +# TOTAL_GPUS sum across all workers (filename component) +# +# Optional per-recipe overrides (defaults shown): +# MODEL_NAME=$MODEL override when server's served-model-name differs +# from the master-yaml `model:` field # PORT=8000 frontend port reachable at localhost -# REQ_RATE=inf -# RANDOM_RANGE_RATIO=0.8 +# BACKEND=dynamo +# ENDPOINT=/v1/completions # NUM_PROMPTS_MULT=10 prompts per conc = NUM_PROMPTS_MULT * conc -# NUM_WARMUP_MULT=2 warmup prompts per conc = NUM_WARMUP_MULT * conc # USE_CHAT_TEMPLATE=true -# CUSTOM_TOKENIZER= (empty: skip --custom-tokenizer) +# DSV4=false sets the --dsv4 flag (auto-enables chat template) +# TRUST_REMOTE_CODE=true # DATASET_NAME=random -# DATASET_PATH= (only used when DATASET_NAME != random) -# TOKENIZER_PATH=$MODEL_PATH (or container path; falls back to $MODEL_NAME) -# PORT_HEALTH_PATH=/v1/models +# DATASET_PATH= (only meaningful when DATASET_NAME != random) # -# The InferenceX repo is bind-mounted into the container at /infmax-workspace -# (configured by the recipe's `container_mounts` block). This script lives at -# /infmax-workspace/benchmarks/multi_node/srt_bench.sh and shells out to -# /infmax-workspace/utils/bench_serving/benchmark_serving.py. +# The InferenceX repo is bind-mounted at /infmax-workspace via each recipe's +# `container_mounts` block. Model files are auto-mounted at /model by srtctl +# (RuntimeContext.create unconditionally adds the mount when model.path is a +# local path), so we point --tokenizer at /model to load the tokenizer from +# the same files the engine is serving — no HF Hub dependency. 
set -euo pipefail INFMAX_WS="${INFMAX_CONTAINER_WORKSPACE:-/infmax-workspace}" +# shellcheck disable=SC1091 +source "$INFMAX_WS/benchmarks/benchmark_lib.sh" -require() { - for v in "$@"; do - if [[ -z "${!v:-}" ]]; then - echo "ERROR: required env var '$v' is unset" >&2 - exit 64 - fi - done -} -require ISL OSL CONCURRENCIES MODEL_NAME IS_DISAGGREGATED TOTAL_GPUS +check_env_vars MODEL ISL OSL CONC_LIST DISAGG \ + PREFILL_GPUS DECODE_GPUS TOTAL_GPUS +MODEL_NAME="${MODEL_NAME:-$MODEL}" PORT="${PORT:-8000}" -REQ_RATE="${REQ_RATE:-inf}" +BACKEND="${BACKEND:-dynamo}" +ENDPOINT="${ENDPOINT:-/v1/completions}" RANDOM_RANGE_RATIO="${RANDOM_RANGE_RATIO:-0.8}" NUM_PROMPTS_MULT="${NUM_PROMPTS_MULT:-10}" -NUM_WARMUP_MULT="${NUM_WARMUP_MULT:-2}" USE_CHAT_TEMPLATE="${USE_CHAT_TEMPLATE:-true}" -CUSTOM_TOKENIZER="${CUSTOM_TOKENIZER:-}" +DSV4="${DSV4:-false}" +TRUST_REMOTE_CODE="${TRUST_REMOTE_CODE:-true}" DATASET_NAME="${DATASET_NAME:-random}" DATASET_PATH="${DATASET_PATH:-}" -PREFILL_GPUS="${PREFILL_GPUS:-0}" -DECODE_GPUS="${DECODE_GPUS:-0}" -ENDPOINT="http://localhost:${PORT}" RESULT_DIR="/logs/sa-bench_isl_${ISL}_osl_${OSL}" mkdir -p "$RESULT_DIR" -BENCH_PY="${INFMAX_WS}/utils/bench_serving/benchmark_serving.py" -[[ -f "$BENCH_PY" ]] || { echo "ERROR: benchmark_serving.py not found at $BENCH_PY (mount $INFMAX_WS missing?)" >&2; exit 65; } - -# Bench-serving deps. The srt-slurm worker container ships most of these but -# not all (datasets in particular). Reuse system-site-packages so we don't -# rebuild what's already there. -ensure_deps() { +# srt-slurm worker containers don't always ship bench_serving.py's runtime +# deps (datasets in particular). Install missing ones into a system-site- +# packages venv so we don't perturb the framework's own packages. +ensure_bench_serving_deps() { local deps=(aiohttp numpy pandas datasets Pillow tqdm transformers huggingface_hub) if python3 -c "import aiohttp, numpy, pandas, datasets, PIL, tqdm, transformers, huggingface_hub" 2>/dev/null; then return @@ -74,79 +84,47 @@ ensure_deps() { source "$venv/bin/activate" pip install --quiet "${deps[@]}" } -ensure_deps +ensure_bench_serving_deps -# Verify endpoint -echo "Verifying endpoint at $ENDPOINT ..." -curl -fsS "${ENDPOINT}/v1/models" >/dev/null || { - echo "ERROR: endpoint $ENDPOINT did not respond on /v1/models" >&2 +curl -fsS "http://localhost:${PORT}/v1/models" >/dev/null || { + echo "ERROR: frontend at http://localhost:${PORT} did not respond on /v1/models" >&2 exit 66 } - ulimit -n 65536 2>/dev/null || true -DATASET_ARGS=(--dataset-name "$DATASET_NAME") -[[ -n "$DATASET_PATH" ]] && DATASET_ARGS+=(--dataset-path "$DATASET_PATH") - -RANDOM_LEN_ARGS=() -if [[ "$DATASET_NAME" == "random" ]]; then - RANDOM_LEN_ARGS=( - --random-input-len "$ISL" - --random-output-len "$OSL" - --random-range-ratio "$RANDOM_RANGE_RATIO" - ) -fi - -CHAT_TEMPLATE_ARGS=() -[[ "$USE_CHAT_TEMPLATE" == "true" ]] && CHAT_TEMPLATE_ARGS+=(--use-chat-template) +# CONC_LIST from the workflow is space-separated; bench loops one run per value. +read -r -a CONC_LIST_ARR <<< "$CONC_LIST" -CUSTOM_TOKENIZER_ARGS=() -[[ -n "$CUSTOM_TOKENIZER" ]] && CUSTOM_TOKENIZER_ARGS+=(--custom-tokenizer "$CUSTOM_TOKENIZER") - -# `tokenizer` is required by benchmark_serving.py; pass MODEL_NAME by default -# (HF will fetch). Recipe can override via TOKENIZER_PATH for a local path. -TOKENIZER_PATH="${TOKENIZER_PATH:-$MODEL_NAME}" - -# Concurrency list is "x"-separated for parity with sa-bench. 
-IFS='x' read -r -a CONC_LIST <<< "$CONCURRENCIES" - -run_bench() { - local conc=$1 - local n_prompts=$2 - local request_rate=$3 - shift 3 - python3 -u "$BENCH_PY" \ - --model "$MODEL_NAME" --tokenizer "$TOKENIZER_PATH" \ - --host localhost --port "$PORT" \ - --backend dynamo --endpoint /v1/completions \ - --disable-tqdm \ - "${DATASET_ARGS[@]}" \ - --num-prompts "$n_prompts" \ - "${RANDOM_LEN_ARGS[@]}" \ - --ignore-eos \ - --request-rate "$request_rate" \ - --percentile-metrics ttft,tpot,itl,e2el \ - --max-concurrency "$conc" \ - --trust-remote-code \ - "${CHAT_TEMPLATE_ARGS[@]}" \ - "${CUSTOM_TOKENIZER_ARGS[@]}" \ - "$@" -} - -for conc in "${CONC_LIST[@]}"; do - echo "=== conc=$conc warmup ===" - run_bench "$conc" "$((conc * NUM_WARMUP_MULT))" 250 || true - - if [[ "$IS_DISAGGREGATED" == "true" ]]; then - result_filename="results_concurrency_${conc}_gpus_${TOTAL_GPUS}_ctx_${PREFILL_GPUS}_gen_${DECODE_GPUS}.json" +for conc in "${CONC_LIST_ARR[@]}"; do + if [[ "$DISAGG" == "true" ]]; then + result_filename="results_concurrency_${conc}_gpus_${TOTAL_GPUS}_ctx_${PREFILL_GPUS}_gen_${DECODE_GPUS}" else - result_filename="results_concurrency_${conc}_gpus_${TOTAL_GPUS}.json" + result_filename="results_concurrency_${conc}_gpus_${TOTAL_GPUS}" fi - - echo "=== conc=$conc bench → $RESULT_DIR/$result_filename ===" - run_bench "$conc" "$((conc * NUM_PROMPTS_MULT))" "$REQ_RATE" \ - --result-dir "$RESULT_DIR" \ + echo "=== conc=$conc → $RESULT_DIR/${result_filename}.json ===" + + args=( + --model "$MODEL_NAME" + --tokenizer /model + --port "$PORT" + --backend "$BACKEND" + --endpoint "$ENDPOINT" + --input-len "$ISL" + --output-len "$OSL" + --random-range-ratio "$RANDOM_RANGE_RATIO" + --num-prompts "$((conc * NUM_PROMPTS_MULT))" + --max-concurrency "$conc" + --dataset-name "$DATASET_NAME" --result-filename "$result_filename" + --result-dir "$RESULT_DIR" + --bench-serving-dir "$INFMAX_WS" + ) + [[ -n "$DATASET_PATH" ]] && args+=(--dataset-path "$DATASET_PATH") + [[ "$USE_CHAT_TEMPLATE" == "true" ]] && args+=(--use-chat-template) + [[ "$DSV4" == "true" ]] && args+=(--dsv4) + [[ "$TRUST_REMOTE_CODE" == "true" ]] && args+=(--trust-remote-code) + + run_benchmark_serving "${args[@]}" done echo "Done. Results in $RESULT_DIR." From adf8a11e095cbb97c25846c4456d6fefc9b339ea Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 28 Apr 2026 15:33:52 -0500 Subject: [PATCH 10/16] srt-slurm: keep run_benchmark_serving pass-throughs to just --tokenizer/--endpoint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Walked back the --dataset-name / --dataset-path additions to run_benchmark_serving — both default cleanly (random / unset) for every multi-node throughput sweep we run, so the pass-throughs were dead weight. srt_bench.sh stops setting DATASET_NAME / DATASET_PATH from env. Kept --tokenizer (srt_bench points it at /model since --model is the served-model-name alias, not a HF id) and --endpoint (recipes may need /v1/chat/completions for chat-template-only request paths). 
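Sketch of the surviving surface as srt_bench.sh drives it (other required
flags elided; the chat endpoint is a hypothetical per-recipe override, not
a default):

    args=(
      --model "$MODEL_NAME"   # served-model-name alias, not always a HF id
      --tokenizer /model      # srtctl's auto-mount of the local model dir
    )
    # only recipes that need the chat-template request path set ENDPOINT
    [[ -n "${ENDPOINT:-}" ]] && args+=(--endpoint "$ENDPOINT")
    run_benchmark_serving "${args[@]}"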
Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/benchmark_lib.sh | 38 +++++++++++------------------- benchmarks/multi_node/srt_bench.sh | 6 ----- 2 files changed, 14 insertions(+), 30 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index e42926dde..ad53360fa 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -206,12 +206,13 @@ run_benchmark_serving() { local dsv4=false local trust_remote_code=false local server_pid="" - # Optional knobs surfaced for the multi-node srt_bench.sh wrapper so it - # can use this same command-build instead of forking its own. - local endpoint="" - local dataset_name="random" - local dataset_path="" + # Optional --tokenizer / --endpoint pass-throughs for the multi-node + # srt_bench.sh. --tokenizer points the bench at the /model auto-mount + # (avoids relying on --model being a HF-resolvable id). --endpoint lets + # recipes target /v1/chat/completions when chat-template-only request + # paths are required. local tokenizer="" + local endpoint="" while [[ $# -gt 0 ]]; do case $1 in @@ -276,22 +277,14 @@ run_benchmark_serving() { server_pid="$2" shift 2 ;; - --endpoint) - endpoint="$2" - shift 2 - ;; - --dataset-name) - dataset_name="$2" - shift 2 - ;; - --dataset-path) - dataset_path="$2" - shift 2 - ;; --tokenizer) tokenizer="$2" shift 2 ;; + --endpoint) + endpoint="$2" + shift 2 + ;; *) echo "Unknown parameter: $1" return 1 @@ -363,7 +356,7 @@ run_benchmark_serving() { --model "$model" --backend "$backend" --base-url "http://0.0.0.0:$port" - --dataset-name "$dataset_name" + --dataset-name random --random-input-len "$input_len" --random-output-len "$output_len" --random-range-ratio "$random_range_ratio" @@ -380,15 +373,12 @@ run_benchmark_serving() { ) # Optional pass-throughs. - if [[ -n "$endpoint" ]]; then - benchmark_cmd+=(--endpoint "$endpoint") - fi - if [[ -n "$dataset_path" ]]; then - benchmark_cmd+=(--dataset-path "$dataset_path") - fi if [[ -n "$tokenizer" ]]; then benchmark_cmd+=(--tokenizer "$tokenizer") fi + if [[ -n "$endpoint" ]]; then + benchmark_cmd+=(--endpoint "$endpoint") + fi # Add --use-chat-template if requested if [[ "$use_chat_template" == true ]]; then diff --git a/benchmarks/multi_node/srt_bench.sh b/benchmarks/multi_node/srt_bench.sh index 9e82a08cb..7b165faf2 100755 --- a/benchmarks/multi_node/srt_bench.sh +++ b/benchmarks/multi_node/srt_bench.sh @@ -38,8 +38,6 @@ # USE_CHAT_TEMPLATE=true # DSV4=false sets the --dsv4 flag (auto-enables chat template) # TRUST_REMOTE_CODE=true -# DATASET_NAME=random -# DATASET_PATH= (only meaningful when DATASET_NAME != random) # # The InferenceX repo is bind-mounted at /infmax-workspace via each recipe's # `container_mounts` block. 
Model files are auto-mounted at /model by srtctl @@ -64,8 +62,6 @@ NUM_PROMPTS_MULT="${NUM_PROMPTS_MULT:-10}" USE_CHAT_TEMPLATE="${USE_CHAT_TEMPLATE:-true}" DSV4="${DSV4:-false}" TRUST_REMOTE_CODE="${TRUST_REMOTE_CODE:-true}" -DATASET_NAME="${DATASET_NAME:-random}" -DATASET_PATH="${DATASET_PATH:-}" RESULT_DIR="/logs/sa-bench_isl_${ISL}_osl_${OSL}" mkdir -p "$RESULT_DIR" @@ -114,12 +110,10 @@ for conc in "${CONC_LIST_ARR[@]}"; do --random-range-ratio "$RANDOM_RANGE_RATIO" --num-prompts "$((conc * NUM_PROMPTS_MULT))" --max-concurrency "$conc" - --dataset-name "$DATASET_NAME" --result-filename "$result_filename" --result-dir "$RESULT_DIR" --bench-serving-dir "$INFMAX_WS" ) - [[ -n "$DATASET_PATH" ]] && args+=(--dataset-path "$DATASET_PATH") [[ "$USE_CHAT_TEMPLATE" == "true" ]] && args+=(--use-chat-template) [[ "$DSV4" == "true" ]] && args+=(--dsv4) [[ "$TRUST_REMOTE_CODE" == "true" ]] && args+=(--trust-remote-code) From baf8e28ae02efe06ec05f031d9c989358ca8ba1b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 28 Apr 2026 15:50:56 -0500 Subject: [PATCH 11/16] srt-slurm: compress recipe-resolution block in benchmark template MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Same behavior, fewer lines: collapse the two-step suffix split into a single ${RECIPE#"${RECIPE%%:*}"} parameter expansion. 12 active lines become 5. No semantic change — verified parsing for plain paths, :override, and :zip_override_[N] forms. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../workflows/benchmark-multinode-tmpl.yml | 22 ++++++------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index b6b6a30f3..a8005096b 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -179,22 +179,14 @@ jobs: echo "RESULT_FILENAME=${RESULT_FILENAME}" >> $GITHUB_ENV export ${{ join(fromJson(inputs.prefill-additional-settings), ' ') }} ${{ join(fromJson(inputs.decode-additional-settings), ' ') }} - # Resolve `recipe` (path relative to benchmarks/multi_node/srt-slurm-recipes/, - # optionally ending in `:override[N]`) into an absolute CONFIG_FILE for the - # launcher. Copy the recipe to a scratch path first so the launcher's - # `sed -i` rewrites (job name, health-check timeout, ...) don't mutate the - # tracked file in-place between runs. + # RECIPE = "[:override[N]]" relative to benchmarks/multi_node/srt-slurm-recipes/. + # Copy the file to scratch so the launcher's `sed -i` rewrites don't mutate the + # tracked recipe between concurrent runs; preserve any :override suffix verbatim. 
if [[ -n "$RECIPE" ]]; then - recipe_path="${RECIPE%%:*}" - recipe_suffix="" - if [[ "$RECIPE" == *:* ]]; then - recipe_suffix=":${RECIPE#*:}" - fi - src="${GITHUB_WORKSPACE}/benchmarks/multi_node/srt-slurm-recipes/${recipe_path}" - scratch_dir="$(mktemp -d)" - scratch_recipe="${scratch_dir}/$(basename "$recipe_path")" - cp "$src" "$scratch_recipe" - export CONFIG_FILE="${scratch_recipe}${recipe_suffix}" + src="${GITHUB_WORKSPACE}/benchmarks/multi_node/srt-slurm-recipes/${RECIPE%%:*}" + scratch="$(mktemp -d)/$(basename "${RECIPE%%:*}")" + cp "$src" "$scratch" + export CONFIG_FILE="${scratch}${RECIPE#"${RECIPE%%:*}"}" fi export IS_MULTINODE=true bash ./runners/launch_${RUNNER_NAME%%_*}.sh From d3e9b932e013178bfdc29b4eb92f0724fa462d0b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 28 Apr 2026 16:07:46 -0500 Subject: [PATCH 12/16] runners: roll srt-slurm pin back one commit to dodge nginx ulimit regression MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Upstream commit 52e697d (#108 "fix(nginx): raise file descriptor limit for nginx workers") prepends `ulimit -n 1048576 &&` to the nginx srun command. On clusters whose container inherits a sub-1M RLIMIT_NOFILE hard limit from slurmd/PAM, the bash builtin's setrlimit fails with EPERM (raising the hard rlimit needs CAP_SYS_RESOURCE in the init user namespace, which pyxis --container-remap-root does not grant). The `&&` short-circuits and nginx never starts — caught when re-running dsr1-fp4-gb200-dynamo-sglang. Pin back to 698590e ("feat(config): cluster-wide default_bash_preamble for ulimits and the like (#104)"), the immediately prior commit, where nginx runs without the chained ulimit. Bump forward once upstream softens the ulimit to `|| true` or makes it opt-in. Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/benchmark_lib.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index ad53360fa..4394b2f32 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -917,7 +917,13 @@ sanitize_image_filename() { # UV_VENV_DIR default .venv (inside the cloned repo) clone_and_install_srtctl() { local repo_url="https://github.com/NVIDIA/srt-slurm.git" - local ref="52e697d595569b1055b3bb436e06408a6f078293" + # 52e697d (#108 fix(nginx): raise file descriptor limit for nginx workers) + # adds an unconditional `ulimit -n 1048576 && nginx` chain that fails with + # EPERM on clusters whose container RLIMIT_NOFILE hard limit is below 1M + # (CAP_SYS_RESOURCE in a user namespace can't raise the hard rlimit past + # what was inherited from slurmd/PAM). Pin to the prior commit until + # upstream softens that to `|| true` or makes the bump opt-in. + local ref="698590e6486b1febb31f8887b240cf84241ca1db" local repo_dir="${SRT_REPO_DIR:-srt-slurm}" local uv_install_dir="${UV_INSTALL_DIR:-${HOME}/.local/bin}" local uv_venv_dir="${UV_VENV_DIR:-.venv}" From 12410868ab7550f17dc2da95b2089ba45f9deb4b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 28 Apr 2026 16:18:45 -0500 Subject: [PATCH 13/16] runners: bump srt-slurm pin to ishan-rework-nginx (425b486) Replaces the temporary rollback (698590e) with the upstream fix branch. 425b486 is the tip of NVIDIA/srt-slurm's `ishan-rework-nginx`, which makes the nginx ulimit + nginx.conf `worker_rlimit_nofile` directive opt-in via a new `frontend.nginx_raise_ulimit` field (default false). 
Without us opting in, nginx runs without the EPERM-prone bump that #108 introduced. Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/benchmark_lib.sh | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 4394b2f32..d76a7439e 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -917,13 +917,11 @@ sanitize_image_filename() { # UV_VENV_DIR default .venv (inside the cloned repo) clone_and_install_srtctl() { local repo_url="https://github.com/NVIDIA/srt-slurm.git" - # 52e697d (#108 fix(nginx): raise file descriptor limit for nginx workers) - # adds an unconditional `ulimit -n 1048576 && nginx` chain that fails with - # EPERM on clusters whose container RLIMIT_NOFILE hard limit is below 1M - # (CAP_SYS_RESOURCE in a user namespace can't raise the hard rlimit past - # what was inherited from slurmd/PAM). Pin to the prior commit until - # upstream softens that to `|| true` or makes the bump opt-in. - local ref="698590e6486b1febb31f8887b240cf84241ca1db" + # Pinned to ishan-rework-nginx tip — gates the nginx ulimit + worker_rlimit_nofile + # behind an opt-in `frontend.nginx_raise_ulimit` field (default false). #108's + # unconditional `ulimit -n 1048576 && nginx` chain previously crashed clusters + # whose container RLIMIT_NOFILE hard limit was below 1M. + local ref="425b486ce23c6a68ddb57009998a666c0acd0892" local repo_dir="${SRT_REPO_DIR:-srt-slurm}" local uv_install_dir="${UV_INSTALL_DIR:-${HOME}/.local/bin}" local uv_venv_dir="${UV_VENV_DIR:-.venv}" From fecd2de2b50b7a98b839b7df528b72a88550b4c6 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 28 Apr 2026 16:27:58 -0500 Subject: [PATCH 14/16] srt-slurm: default bench backend to `openai`, drop hardcoded /v1/completions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Upstream sa-bench used `--backend dynamo --endpoint /v1/completions`, but this repo's benchmark_serving.py doesn't have a `dynamo` backend choice (it has tgi/vllm/lmdeploy/deepspeed-mii/openai/openai-chat/tensorrt-llm/ scalellm/sglang). The dynamo frontend exposes a generic OpenAI-compatible API regardless of the underlying engine, so `openai` is the right canonical default. Recipes that need /v1/chat/completions can override via ENDPOINT. Also drop the unconditional `--endpoint /v1/completions` — bench_serving.py already defaults to that, and we now only pass --endpoint when ENDPOINT is non-empty (matches single-node bench scripts that don't pass it at all). 
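Usage sketch (the second invocation is a hypothetical recipe-level
override, expressed as the shell its `benchmark.env` block reduces to):

    # default: generic OpenAI completions against the dynamo frontend
    bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh
    # chat-template-only path: override both knobs
    BACKEND=openai-chat ENDPOINT=/v1/chat/completions \
      bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh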
Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/multi_node/srt_bench.sh | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/benchmarks/multi_node/srt_bench.sh b/benchmarks/multi_node/srt_bench.sh index 7b165faf2..aeb1ef502 100755 --- a/benchmarks/multi_node/srt_bench.sh +++ b/benchmarks/multi_node/srt_bench.sh @@ -32,8 +32,8 @@ # MODEL_NAME=$MODEL override when server's served-model-name differs # from the master-yaml `model:` field # PORT=8000 frontend port reachable at localhost -# BACKEND=dynamo -# ENDPOINT=/v1/completions +# BACKEND=openai generic OpenAI-API; works against the dynamo frontend +# ENDPOINT= empty -> bench_serving.py default (/v1/completions) # NUM_PROMPTS_MULT=10 prompts per conc = NUM_PROMPTS_MULT * conc # USE_CHAT_TEMPLATE=true # DSV4=false sets the --dsv4 flag (auto-enables chat template) @@ -55,8 +55,11 @@ check_env_vars MODEL ISL OSL CONC_LIST DISAGG \ MODEL_NAME="${MODEL_NAME:-$MODEL}" PORT="${PORT:-8000}" -BACKEND="${BACKEND:-dynamo}" -ENDPOINT="${ENDPOINT:-/v1/completions}" +# `openai` matches every dynamo frontend (frontend exposes a generic OpenAI- +# compatible API regardless of the underlying engine). Recipes that need +# /v1/chat/completions can override ENDPOINT. +BACKEND="${BACKEND:-openai}" +ENDPOINT="${ENDPOINT:-}" RANDOM_RANGE_RATIO="${RANDOM_RANGE_RATIO:-0.8}" NUM_PROMPTS_MULT="${NUM_PROMPTS_MULT:-10}" USE_CHAT_TEMPLATE="${USE_CHAT_TEMPLATE:-true}" @@ -104,7 +107,6 @@ for conc in "${CONC_LIST_ARR[@]}"; do --tokenizer /model --port "$PORT" --backend "$BACKEND" - --endpoint "$ENDPOINT" --input-len "$ISL" --output-len "$OSL" --random-range-ratio "$RANDOM_RANGE_RATIO" @@ -114,6 +116,7 @@ for conc in "${CONC_LIST_ARR[@]}"; do --result-dir "$RESULT_DIR" --bench-serving-dir "$INFMAX_WS" ) + [[ -n "$ENDPOINT" ]] && args+=(--endpoint "$ENDPOINT") [[ "$USE_CHAT_TEMPLATE" == "true" ]] && args+=(--use-chat-template) [[ "$DSV4" == "true" ]] && args+=(--dsv4) [[ "$TRUST_REMOTE_CODE" == "true" ]] && args+=(--trust-remote-code) From 24d118f7adb0b9fc1910831f5ee30241a3914659 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 28 Apr 2026 16:55:13 -0500 Subject: [PATCH 15/16] runners: bump srt-slurm pin to NVIDIA/main@1372a10 Both fixes we wanted are now on origin/main: * #110 nginx-rework-ulimit (Ishan): gates the 1M nofile bump behind opt-in frontend.nginx_raise_ulimit. Default off, fixes clusters whose container RLIMIT_NOFILE hard cap < 1M. * #111 (cam): demotes the per-srun command logger.info to logger.debug so the 5KB fingerprint heredoc stops dominating orchestrator logs. Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/benchmark_lib.sh | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index d76a7439e..e1d94b1a6 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -917,11 +917,12 @@ sanitize_image_filename() { # UV_VENV_DIR default .venv (inside the cloned repo) clone_and_install_srtctl() { local repo_url="https://github.com/NVIDIA/srt-slurm.git" - # Pinned to ishan-rework-nginx tip — gates the nginx ulimit + worker_rlimit_nofile - # behind an opt-in `frontend.nginx_raise_ulimit` field (default false). #108's - # unconditional `ulimit -n 1048576 && nginx` chain previously crashed clusters - # whose container RLIMIT_NOFILE hard limit was below 1M. - local ref="425b486ce23c6a68ddb57009998a666c0acd0892" + # Pinned to NVIDIA/srt-slurm@main — currently 1372a10. 
Includes: + # * #110 nginx-rework-ulimit: gates `ulimit -n 1048576` + worker_rlimit_nofile + # behind opt-in `frontend.nginx_raise_ulimit` (we don't opt in). + # * #111 srun command line log demoted INFO -> DEBUG (5KB fingerprint + # heredoc no longer dominates orchestrator log). + local ref="1372a10c493e3fd757f342d8516a5a91c30fe6ce" local repo_dir="${SRT_REPO_DIR:-srt-slurm}" local uv_install_dir="${UV_INSTALL_DIR:-${HOME}/.local/bin}" local uv_venv_dir="${UV_VENV_DIR:-.venv}" From 792d8aa4b8586acb09227d4a04776bb8b956bcd3 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 28 Apr 2026 17:11:46 -0500 Subject: [PATCH 16/16] =?UTF-8?q?srt-slurm:=20migrate=20remaining=20364=20?= =?UTF-8?q?recipes=20from=20sa-bench=20=E2=86=92=20custom?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that the proof-of-life recipe (dsr1-fp4-gb200-dynamo-sglang low-latency, conc 4/8/32) ran clean end-to-end on a real cluster, sweep the rest of the tree onto the new shape so all multi-node throughput sweeps drive utils/bench_serving/benchmark_serving.py via benchmarks/multi_node/srt_bench.sh instead of srt-slurm's bundled sa-bench client. Each migrated recipe replaces: benchmark: type: "sa-bench" isl: … osl: … concurrencies: … req_rate: … [use_chat_template: false] with: container_mounts: "$INFMAX_WORKSPACE": "/infmax-workspace" benchmark: type: "custom" command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" env: [MODEL_NAME: "..."] # only when server's served-model-name diverges # from the master-yaml `model:` value PREFILL_GPUS: "..." # per prefill worker (filename component) DECODE_GPUS: "..." # per decode worker (filename component) TOTAL_GPUS: "..." # sum across all workers (filename component) [USE_CHAT_TEMPLATE: "false"] # only carried over when set in original GPU counts derived from each recipe's `resources:` block — uses gpus_per_prefill / gpus_per_decode when set, else falls back to nodes * gpus_per_node / workers. MODEL_NAME override added on the 59 sglang recipes whose backend.sglang_config.served-model-name is "deepseek-ai/DeepSeek-R1" while master-yaml `model:` is the more specific "deepseek-ai/DeepSeek-R1-0528" / "nvidia/DeepSeek-R1-0528-NVFP4-v2" revision tag. Skipped: - 3 sglang multi-override base files (1k1k.yaml / 8k1k.yaml under dsr1/sglang/b200-fp{4,8}/) — their `benchmark:` lives nested under `base:` and gets sa-bench-style overrides per `:override[N]` reference. Migrating them needs a separate pass that handles the override-merge semantics; their 26 master-yaml refs continue to dispatch via srt-slurm's bundled sa-bench until then. Tracked as follow-up. Validation: schema accepts all 81 master-yaml entries, 149/149 tests pass. 
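The GPU-count rule, sketched in shell (gpus_per_* field names follow the
recipes' `resources:` blocks; `prefill_workers`/`decode_workers` are
illustrative names for the per-role worker counts, not literal recipe keys):

    # per role: explicit gpus_per_* wins, else divide the node pool evenly
    prefill_gpus="${gpus_per_prefill:-$(( nodes * gpus_per_node / workers ))}"
    decode_gpus="${gpus_per_decode:-$(( nodes * gpus_per_node / workers ))}"
    # TOTAL_GPUS is the sum across all workers
    total_gpus=$(( prefill_workers * prefill_gpus + decode_workers * decode_gpus ))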
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml | 17 ++++++++++++----- .../8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml | 17 ++++++++++++----- .../8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml | 17 ++++++++++++----- .../8k1k/disagg/mtp/8k1k_mtp_maxtpt_0.yaml | 17 ++++++++++++----- .../8k1k/disagg/mtp/8k1k_mtp_maxtpt_1.yaml | 17 ++++++++++++----- .../8k1k/disagg/mtp/8k1k_mtp_maxtpt_2.yaml | 17 ++++++++++++----- .../8k1k/disagg/mtp/8k1k_mtp_maxtpt_3.yaml | 17 ++++++++++++----- .../8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml | 17 ++++++++++++----- .../8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml | 17 ++++++++++++----- .../8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml | 17 ++++++++++++----- .../8k1k/disagg/stp/8k1k_stp_maxtpt_0.yaml | 17 ++++++++++++----- .../8k1k/disagg/stp/8k1k_stp_maxtpt_1.yaml | 17 ++++++++++++----- .../8k1k/disagg/stp/8k1k_stp_maxtpt_2.yaml | 17 ++++++++++++----- .../8k1k/disagg/stp/8k1k_stp_maxtpt_3.yaml | 17 ++++++++++++----- .../gb200-fp4/1k1k/disagg/stp/max-tpt.yaml | 17 ++++++++++++----- .../gb200-fp4/1k1k/disagg/stp/mid-curve.yaml | 17 ++++++++++++----- .../gb200-fp4/8k1k/disagg/stp/low-latency.yaml | 17 ++++++++++++----- .../gb200-fp4/8k1k/disagg/stp/max-tpt.yaml | 17 ++++++++++++----- .../gb200-fp4/8k1k/disagg/stp/mid-curve.yaml | 17 ++++++++++++----- .../gb200-fp8/1k1k/disagg/stp/low-latency.yaml | 17 ++++++++++++----- .../gb200-fp8/1k1k/disagg/stp/max-tpt.yaml | 17 ++++++++++++----- .../gb200-fp8/1k1k/disagg/stp/mid-curve.yaml | 17 ++++++++++++----- .../gb200-fp8/1k1k/disagg/stp/ultra-tpt.yaml | 17 ++++++++++++----- .../gb200-fp8/8k1k/disagg/stp/low-latency.yaml | 17 ++++++++++++----- .../gb200-fp8/8k1k/disagg/stp/max_tpt.yaml | 17 ++++++++++++----- .../gb200-fp8/8k1k/disagg/stp/mid-curve.yaml | 17 ++++++++++++----- .../gb300-fp4/1k1k/disagg/stp/low_latency.yaml | 17 ++++++++++++----- .../gb300-fp4/1k1k/disagg/stp/max_tpt.yaml | 17 ++++++++++++----- .../gb300-fp4/1k1k/disagg/stp/mid_curve.yaml | 17 ++++++++++++----- .../gb300-fp4/8k1k/disagg/stp/low_latency.yaml | 17 ++++++++++++----- .../gb300-fp4/8k1k/disagg/stp/max_tpt.yaml | 17 ++++++++++++----- .../gb300-fp4/8k1k/disagg/stp/mid_curve.yaml | 17 ++++++++++++----- .../gb300-fp8/1k1k/disagg/stp/low-latency.yaml | 17 ++++++++++++----- .../sglang/gb300-fp8/1k1k/disagg/stp/max.yaml | 17 ++++++++++++----- .../sglang/gb300-fp8/1k1k/disagg/stp/mid.yaml | 17 ++++++++++++----- .../gb300-fp8/8k1k/disagg/stp/low-latency.yaml | 17 ++++++++++++----- .../sglang/gb300-fp8/8k1k/disagg/stp/max.yaml | 17 ++++++++++++----- .../sglang/gb300-fp8/8k1k/disagg/stp/mid.yaml | 17 ++++++++++++----- .../disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml | 17 ++++++++++++----- .../disagg/mtp/h100-fp8-1p2d-max-tp-mtp.yaml | 17 ++++++++++++----- .../1k1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml | 17 ++++++++++++----- .../1k1k/disagg/stp/h100-fp8-1p2d-max-tp.yaml | 17 ++++++++++++----- .../disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml | 17 ++++++++++++----- .../disagg/mtp/h100-fp8-1p1d-max-tp-mtp.yaml | 17 ++++++++++++----- .../8k1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml | 17 ++++++++++++----- .../8k1k/disagg/stp/h100-fp8-1p1d-max-tp.yaml | 17 ++++++++++++----- .../1k1k/disagg/mtp/bs256-1p6d-dep-mtp.yaml | 17 ++++++++++++----- .../1k1k/disagg/mtp/bs256-1p6d-tp-mtp.yaml | 18 ++++++++++++------ .../1k1k/disagg/mtp/low-latency-1p9d-mtp.yaml | 17 ++++++++++++----- .../1k1k/disagg/stp/bs256-1p6d-dep.yaml | 17 ++++++++++++----- .../1k1k/disagg/stp/bs256-1p6d-tp.yaml | 18 ++++++++++++------ .../1k1k/disagg/stp/low-latency-1p9d.yaml | 17 
++++++++++++----- .../8k1k/disagg/mtp/bs128-1p1d-dep-mtp.yaml | 17 ++++++++++++----- .../8k1k/disagg/mtp/bs16-1p3d-mtp.yaml | 17 ++++++++++++----- .../h200-fp8/8k1k/disagg/mtp/bs4-1p7d-mtp.yaml | 17 ++++++++++++----- .../8k1k/disagg/mtp/bs64-2p3d-mtp.yaml | 17 ++++++++++++----- .../h200-fp8/8k1k/disagg/mtp/bs8-1p6d-mtp.yaml | 17 ++++++++++++----- .../8k1k/disagg/stp/bs128-1p1d-dep.yaml | 17 ++++++++++++----- .../h200-fp8/8k1k/disagg/stp/bs16-1p3d.yaml | 17 ++++++++++++----- .../h200-fp8/8k1k/disagg/stp/bs4-1p7d.yaml | 17 ++++++++++++----- .../h200-fp8/8k1k/disagg/stp/bs64-2p3d.yaml | 17 ++++++++++++----- .../h200-fp8/8k1k/disagg/stp/bs8-1p6d.yaml | 17 ++++++++++++----- .../mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml | 16 +++++++++++----- .../ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml | 16 +++++++++++----- .../ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml | 16 +++++++++++----- .../ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml | 16 +++++++++++----- .../mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml | 16 +++++++++++----- .../ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml | 16 +++++++++++----- .../mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml | 16 +++++++++++----- .../stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml | 16 +++++++++++----- ...tx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml | 16 +++++++++++----- ...tx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml | 16 +++++++++++----- ...tx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml | 16 +++++++++++----- ...ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml | 16 +++++++++++----- .../mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml | 16 +++++++++++----- .../ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml | 16 +++++++++++----- .../ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml | 16 +++++++++++----- .../ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml | 16 +++++++++++----- ...tx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml | 16 +++++++++++----- ...ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml | 16 +++++++++++----- .../ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml | 16 +++++++++++----- .../ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml | 16 +++++++++++----- ...ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml | 16 +++++++++++----- ...tx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml | 16 +++++++++++----- .../ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml | 16 +++++++++++----- .../ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml | 16 +++++++++++----- .../ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml | 16 +++++++++++----- .../mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml | 16 +++++++++++----- 
.../ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml | 16 +++++++++++----- .../ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml | 16 +++++++++++----- ...tx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml | 16 +++++++++++----- ...ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml | 16 +++++++++++----- ...ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml | 16 +++++++++++----- .../stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml | 16 +++++++++++----- .../ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml | 16 +++++++++++----- .../ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml | 16 +++++++++++----- .../ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml | 16 +++++++++++----- .../ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml | 16 +++++++++++----- ...ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml | 16 +++++++++++----- .../mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml | 16 +++++++++++----- .../ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml | 16 +++++++++++----- .../ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml | 16 +++++++++++----- .../ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml | 16 +++++++++++----- .../stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml | 16 +++++++++++----- .../mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml | 16 +++++++++++----- .../ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml | 16 +++++++++++----- .../stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml | 16 +++++++++++----- ...ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml | 16 +++++++++++----- ...tx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml | 16 +++++++++++----- .../ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml | 16 +++++++++++----- .../ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml | 16 +++++++++++----- .../ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml | 16 +++++++++++----- ...tx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml | 16 +++++++++++----- ...tx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml | 16 +++++++++++----- ...ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml | 16 +++++++++++----- .../ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml | 16 +++++++++++----- .../ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml | 16 +++++++++++----- .../ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml | 16 +++++++++++----- .../ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml | 16 +++++++++++----- ...x2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml | 16 +++++++++++----- .../ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml | 16 +++++++++++----- 
.../ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml | 16 +++++++++++----- .../mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml | 16 +++++++++++----- .../ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml | 16 +++++++++++----- .../ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml | 16 +++++++++++----- .../ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml | 16 +++++++++++----- .../ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml | 16 +++++++++++----- .../ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml | 16 +++++++++++----- .../ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml | 16 +++++++++++----- .../ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml | 16 +++++++++++----- .../ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml | 16 +++++++++++----- ...ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml | 16 +++++++++++----- ...tx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml | 16 +++++++++++----- .../mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml | 16 +++++++++++----- .../ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml | 16 +++++++++++----- .../ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml | 16 +++++++++++----- .../ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml | 16 +++++++++++----- .../ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml | 16 +++++++++++----- .../ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml | 16 +++++++++++----- ...ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml | 16 +++++++++++----- .../mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml | 16 +++++++++++----- .../ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml | 16 +++++++++++----- .../ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml | 16 +++++++++++----- ...ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml | 16 +++++++++++----- ...tx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml | 16 +++++++++++----- ...ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml | 16 +++++++++++----- ...tx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml | 16 +++++++++++----- ...tx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml | 16 +++++++++++----- .../ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml | 16 +++++++++++----- .../ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml | 16 +++++++++++----- .../ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml | 16 +++++++++++----- ...x1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml | 16 +++++++++++----- ...tx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml | 16 +++++++++++----- .../ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml | 16 +++++++++++----- ...tx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml | 16 +++++++++++----- ...tx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml | 16 +++++++++++----- .../ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml | 16 +++++++++++----- .../ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml | 16 +++++++++++----- .../ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml | 16 +++++++++++----- .../ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml | 16 +++++++++++----- .../ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml | 16 +++++++++++----- 
...ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml | 16 +++++++++++----- .../ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml | 16 +++++++++++----- .../ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml | 16 +++++++++++----- ...ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml | 16 +++++++++++----- .../ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml | 16 +++++++++++----- .../ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml | 16 +++++++++++----- .../ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml | 16 +++++++++++----- .../ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml | 16 +++++++++++----- ...ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml | 16 +++++++++++----- ...ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml | 16 +++++++++++----- ...tx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml | 16 +++++++++++----- .../mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml | 16 +++++++++++----- .../ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml | 16 +++++++++++----- .../mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml | 16 +++++++++++----- .../ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml | 16 +++++++++++----- .../ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml | 16 +++++++++++----- .../stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml | 16 +++++++++++----- .../ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml | 16 +++++++++++----- .../ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml | 16 +++++++++++----- .../ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml | 16 +++++++++++----- .../ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml | 16 +++++++++++----- ...ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml | 16 +++++++++++----- .../ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml | 16 +++++++++++----- .../ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml | 16 +++++++++++----- .../ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml | 16 +++++++++++----- ...x2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml | 16 +++++++++++----- ...ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml | 16 +++++++++++----- ...tx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml | 16 +++++++++++----- .../ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml | 16 +++++++++++----- .../ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml | 16 +++++++++++----- .../ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml | 16 +++++++++++----- ...x2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml | 16 +++++++++++----- ...tx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml | 16 +++++++++++----- ...tx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml | 16 +++++++++++----- 
...x3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml | 16 +++++++++++----- ...x10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml | 16 +++++++++++----- .../ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml | 16 +++++++++++----- .../ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml | 16 +++++++++++----- .../ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml | 16 +++++++++++----- ...tx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml | 16 +++++++++++----- ...ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml | 16 +++++++++++----- .../ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml | 16 +++++++++++----- .../ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml | 16 +++++++++++----- .../ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml | 16 +++++++++++----- ...ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml | 16 +++++++++++----- ...ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml | 16 +++++++++++----- ...tx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml | 16 +++++++++++----- ...tx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml | 16 +++++++++++----- .../ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml | 16 +++++++++++----- .../ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml | 16 +++++++++++----- .../mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml | 16 +++++++++++----- .../ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml | 16 +++++++++++----- .../ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml | 16 +++++++++++----- .../ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml | 16 +++++++++++----- .../ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml | 16 +++++++++++----- .../mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml | 16 +++++++++++----- .../ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml | 16 +++++++++++----- .../ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml | 16 +++++++++++----- ...128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml | 16 +++++++++++----- ...c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml | 16 +++++++++++----- .../c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml | 16 +++++++++++----- ...256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml | 16 +++++++++++----- ...32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml | 16 +++++++++++----- ...c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml | 16 +++++++++++----- ...512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml | 16 +++++++++++----- ...c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml | 16 +++++++++++----- ...c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml | 16 +++++++++++----- 
...128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml | 16 +++++++++++----- ...c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml | 16 +++++++++++----- .../c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml | 16 +++++++++++----- ...256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml | 16 +++++++++++----- ...c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml | 16 +++++++++++----- .../c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml | 16 +++++++++++----- ...512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml | 16 +++++++++++----- ...c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml | 16 +++++++++++----- .../c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml | 16 +++++++++++----- ...c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml | 16 +++++++++++----- .../c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml | 16 +++++++++++----- .../c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml | 16 +++++++++++----- ...c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml | 16 +++++++++++----- .../c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml | 16 +++++++++++----- .../c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml | 16 +++++++++++----- ...c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml | 16 +++++++++++----- .../c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml | 16 +++++++++++----- .../c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml | 16 +++++++++++----- ...128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml | 16 +++++++++++----- .../c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml | 16 +++++++++++----- .../c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml | 16 +++++++++++----- ...256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml | 16 +++++++++++----- ...c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml | 16 +++++++++++----- .../c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml | 16 +++++++++++----- ...512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml | 16 +++++++++++----- ...c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml | 16 +++++++++++----- .../c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/disagg-gb200-1p1d-dep8-dep16.yaml | 18 ++++++++++++------ .../stp/disagg-gb200-3p1d-dep8-dep16.yaml | 18 ++++++++++++------ .../stp/disagg-gb200-1p1d-dep8-tep8.yaml | 18 ++++++++++++------ .../stp/disagg-gb200-3p1d-dep8-dep16.yaml | 18 ++++++++++++------ .../stp/disagg-gb200-7p1d-dep8-dep16.yaml | 18 ++++++++++++------ .../ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml | 16 +++++++++++----- ...4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml | 16 +++++++++++----- ...4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml | 16 +++++++++++----- ...ep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml | 16 +++++++++++----- ...ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml | 16 +++++++++++----- ...ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml | 16 +++++++++++----- ...p4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml | 16 +++++++++++----- ...ep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml | 16 +++++++++++----- ...p4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml | 16 +++++++++++----- ...4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml | 16 +++++++++++----- ...ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/disagg-gb200-1p1d-dep4-dep16.yaml | 16 +++++++++++----- .../stp/disagg-gb200-1p4d-dep4-tep4.yaml | 16 +++++++++++----- .../stp/disagg-gb200-1p4d-dep4-tep4.yaml | 16 +++++++++++----- .../stp/disagg-gb200-3p1d-dep4-dep16.yaml | 16 +++++++++++----- .../stp/disagg-gb200-5p1d-dep4-dep8.yaml | 16 +++++++++++----- .../stp/disagg-gb200-6p1d-dep4-dep16.yaml | 16 +++++++++++----- 364 files changed, 4071 
insertions(+), 1827 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml index 3c1f465fa..36b78e975 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml @@ -133,9 +133,16 @@ health_check: max_attempts: 720 interval_seconds: 10 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: sa-bench - isl: 8192 - osl: 1024 - req_rate: inf - concurrencies: '128' + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "32" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml index 51671712c..0fed3f9a6 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml @@ -133,9 +133,16 @@ health_check: max_attempts: 720 interval_seconds: 10 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: sa-bench - isl: 8192 - osl: 1024 - req_rate: inf - concurrencies: '128' + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "40" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml index 27dbbe30d..e39611a4b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml @@ -133,9 +133,16 @@ health_check: max_attempts: 720 interval_seconds: 10 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: sa-bench - isl: 8192 - osl: 1024 - req_rate: inf - concurrencies: 8x16x32x64x128 + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "56" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_0.yaml index e5eefa2d2..78dc57d5a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_0.yaml @@ -136,9 +136,16 @@ health_check: max_attempts: 720 interval_seconds: 10 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: sa-bench - isl: 8192 - osl: 1024 - req_rate: inf - concurrencies: '288' + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "24" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_1.yaml index fe0cd9a9f..202a10631 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_1.yaml @@ -136,9 +136,16 @@ health_check: max_attempts: 720 interval_seconds: 10 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: sa-bench - isl: 8192 - osl: 1024 - req_rate: inf - concurrencies: 160x288 + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "16" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_2.yaml index 7d050ff12..e2a619e29 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_2.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_2.yaml @@ -136,9 +136,16 @@ health_check: max_attempts: 720 interval_seconds: 10 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: sa-bench - isl: 8192 - osl: 1024 - req_rate: inf - concurrencies: '512' + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "24" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_3.yaml index e687ccf84..5e959ca38 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_3.yaml @@ -136,9 +136,16 @@ health_check: max_attempts: 720 interval_seconds: 10 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: sa-bench - isl: 8192 - osl: 1024 - req_rate: inf - concurrencies: '1024' + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "32" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml index 894cef0c7..24d37e3ee 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml @@ -131,9 +131,16 @@ health_check: max_attempts: 360 interval_seconds: 10 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: sa-bench - isl: 8192 - osl: 1024 - req_rate: inf - concurrencies: '128' + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "32" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml index c05382ef8..c97d109d9 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml @@ -131,9 +131,16 @@ health_check: max_attempts: 360 interval_seconds: 10 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: sa-bench - isl: 8192 - osl: 1024 - req_rate: inf - concurrencies: '128' + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "40" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml index 69e36a289..503f1363b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml @@ -131,9 +131,16 @@ health_check: max_attempts: 360 interval_seconds: 10 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: sa-bench - isl: 8192 - osl: 1024 - req_rate: inf - concurrencies: 8x16x32x64x128 + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "56" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_0.yaml index 9846a1f05..cb8d13717 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_0.yaml @@ -132,9 +132,16 @@ health_check: max_attempts: 360 interval_seconds: 10 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: sa-bench - isl: 8192 - osl: 1024 - req_rate: inf - concurrencies: '288' + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "24" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_1.yaml index e4eccdeab..875893e72 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_1.yaml @@ -132,9 +132,16 @@ health_check: max_attempts: 360 interval_seconds: 10 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: sa-bench - isl: 8192 - osl: 1024 - req_rate: inf - concurrencies: 160x288 + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "16" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_2.yaml index c4cc2dd33..1402c1202 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_2.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_2.yaml @@ -132,9 +132,16 @@ health_check: max_attempts: 360 interval_seconds: 10 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: sa-bench - isl: 8192 - osl: 1024 - req_rate: inf - concurrencies: '512' + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "24" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_3.yaml index 59cbb8197..a689bf0ac 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_3.yaml @@ -132,9 +132,16 @@ health_check: max_attempts: 360 interval_seconds: 10 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: sa-bench - isl: 8192 - osl: 1024 - req_rate: inf - concurrencies: '1024' + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "32" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/max-tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/max-tpt.yaml index 1075c93eb..eb499618e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/max-tpt.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/max-tpt.yaml @@ -175,9 +175,16 @@ backend: dp-size: 48 ep-size: 48 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "2048x4096" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "4" + DECODE_GPUS: "48" + TOTAL_GPUS: "64" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/mid-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/mid-curve.yaml index d8c80dea7..fdfce3821 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/mid-curve.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/mid-curve.yaml @@ -174,9 +174,16 @@ backend: dp-size: 32 ep-size: 32 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "512x2048x4096x8192" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "4" + DECODE_GPUS: "32" + TOTAL_GPUS: "48" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k1k/disagg/stp/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k1k/disagg/stp/low-latency.yaml index 14ebda144..48b044bd3 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k1k/disagg/stp/low-latency.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k1k/disagg/stp/low-latency.yaml @@ -110,9 +110,16 @@ backend: expert-parallel-size: 1 enable-dp-attention: false +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4x8" - req_rate: 300 + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "4" + DECODE_GPUS: "4" + TOTAL_GPUS: "20" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k1k/disagg/stp/max-tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k1k/disagg/stp/max-tpt.yaml index cf2759871..cbf43343b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k1k/disagg/stp/max-tpt.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k1k/disagg/stp/max-tpt.yaml @@ -171,9 +171,16 @@ backend: dp-size: 32 ep-size: 32 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "2048" - req_rate: 700 + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "4" + DECODE_GPUS: "32" + TOTAL_GPUS: "72" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k1k/disagg/stp/mid-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k1k/disagg/stp/mid-curve.yaml index 8380eb5bf..39f9ab7c8 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k1k/disagg/stp/mid-curve.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k1k/disagg/stp/mid-curve.yaml @@ -171,9 +171,16 @@ backend: dp-size: 48 ep-size: 48 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "512x2048x4096" - req_rate: 700 + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "4" + DECODE_GPUS: "48" + TOTAL_GPUS: "72" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/low-latency.yaml index 155d1664c..5dc0c0c73 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/low-latency.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/low-latency.yaml @@ -113,9 +113,16 @@ backend: disaggregation-bootstrap-port: 30001 disaggregation-transfer-backend: nixl +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4x8" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "4" + DECODE_GPUS: "4" + TOTAL_GPUS: "8" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/max-tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/max-tpt.yaml index 5d3c91794..c7a9e0923 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/max-tpt.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/max-tpt.yaml @@ -166,10 +166,17 @@ backend: disaggregation-transfer-backend: nixl +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1024x2048x4096x6144" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "32" + TOTAL_GPUS: "48" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/mid-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/mid-curve.yaml index 1f83ed1bd..0de49d6d7 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/mid-curve.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/mid-curve.yaml @@ -165,10 +165,17 @@ backend: disaggregation-transfer-backend: nixl +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1024x2048x4096" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "48" + TOTAL_GPUS: "72" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/ultra-tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/ultra-tpt.yaml index 08fe2fa90..f335aa042 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/ultra-tpt.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/ultra-tpt.yaml @@ -167,10 +167,17 @@ backend: disaggregation-transfer-backend: nixl +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4096" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "16" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k1k/disagg/stp/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k1k/disagg/stp/low-latency.yaml index 368b03409..94ee5ed1f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k1k/disagg/stp/low-latency.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k1k/disagg/stp/low-latency.yaml @@ -109,9 +109,16 @@ backend: disaggregation-bootstrap-port: 30001 disaggregation-transfer-backend: nixl +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4x8x16" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "16" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k1k/disagg/stp/max_tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k1k/disagg/stp/max_tpt.yaml index f03e34b8d..2865f2e52 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k1k/disagg/stp/max_tpt.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k1k/disagg/stp/max_tpt.yaml @@ -163,9 +163,16 @@ backend: disaggregation-bootstrap-port: 30001 disaggregation-transfer-backend: nixl +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "2048x4096x6144" - req_rate: "300" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "24" + TOTAL_GPUS: "72" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k1k/disagg/stp/mid-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k1k/disagg/stp/mid-curve.yaml index c822d67f3..a1559e71d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k1k/disagg/stp/mid-curve.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k1k/disagg/stp/mid-curve.yaml @@ -162,9 +162,16 @@ backend: disaggregation-bootstrap-port: 30001 disaggregation-transfer-backend: nixl +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "512x1024x2048x6144" - req_rate: "300" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "32" + TOTAL_GPUS: "72" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k1k/disagg/stp/low_latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k1k/disagg/stp/low_latency.yaml index 252eafa2b..c531f8446 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k1k/disagg/stp/low_latency.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k1k/disagg/stp/low_latency.yaml @@ -108,9 +108,16 @@ backend: fp4-gemm-backend: "flashinfer_trtllm" disaggregation-transfer-backend: nixl +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4x8x32" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "4" + DECODE_GPUS: "4" + TOTAL_GPUS: "12" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k1k/disagg/stp/max_tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k1k/disagg/stp/max_tpt.yaml index c941651aa..c4a3d6524 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k1k/disagg/stp/max_tpt.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k1k/disagg/stp/max_tpt.yaml @@ -176,9 +176,16 @@ backend: dp-size: 48 ep-size: 48 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "512x2048x4096x8192" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "4" + DECODE_GPUS: "48" + TOTAL_GPUS: "64" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k1k/disagg/stp/mid_curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k1k/disagg/stp/mid_curve.yaml index 15d3b3930..e6d388906 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k1k/disagg/stp/mid_curve.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k1k/disagg/stp/mid_curve.yaml @@ -174,9 +174,16 @@ backend: dp-size: 32 ep-size: 32 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "512x2048x4096x8192" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "4" + DECODE_GPUS: "32" + TOTAL_GPUS: "48" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k1k/disagg/stp/low_latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k1k/disagg/stp/low_latency.yaml index d3c61231b..5c95e1ffa 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k1k/disagg/stp/low_latency.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k1k/disagg/stp/low_latency.yaml @@ -111,9 +111,16 @@ backend: fp4-gemm-backend: "flashinfer_trtllm" disaggregation-transfer-backend: nixl +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4x8x32x64" - req_rate: 300 + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "4" + DECODE_GPUS: "4" + TOTAL_GPUS: "20" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k1k/disagg/stp/max_tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k1k/disagg/stp/max_tpt.yaml index 001311ed7..29a619a6f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k1k/disagg/stp/max_tpt.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k1k/disagg/stp/max_tpt.yaml @@ -171,9 +171,16 @@ backend: dp-size: 32 ep-size: 32 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "2048" - req_rate: 700 + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "4" + DECODE_GPUS: "32" + TOTAL_GPUS: "72" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k1k/disagg/stp/mid_curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k1k/disagg/stp/mid_curve.yaml index 41043ed0d..b4de76bb9 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k1k/disagg/stp/mid_curve.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k1k/disagg/stp/mid_curve.yaml @@ -171,9 +171,16 @@ backend: dp-size: 48 ep-size: 48 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "512x2048x4096" - req_rate: 700 + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "4" + DECODE_GPUS: "48" + TOTAL_GPUS: "72" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k1k/disagg/stp/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k1k/disagg/stp/low-latency.yaml index 51628e081..57ea3ff5e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k1k/disagg/stp/low-latency.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k1k/disagg/stp/low-latency.yaml @@ -113,10 +113,17 @@ backend: data-parallel-size: 1 expert-parallel-size: 1 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: [4,8,16,32] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "4" + DECODE_GPUS: "4" + TOTAL_GPUS: "20" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k1k/disagg/stp/max.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k1k/disagg/stp/max.yaml index c88a487b8..d27830a5f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k1k/disagg/stp/max.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k1k/disagg/stp/max.yaml @@ -162,10 +162,17 @@ backend: cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 1024] cuda-graph-max-bs: 1024 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: [4096,7168,7680] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "16" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k1k/disagg/stp/mid.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k1k/disagg/stp/mid.yaml index ee6690285..507f5607a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k1k/disagg/stp/mid.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k1k/disagg/stp/mid.yaml @@ -161,10 +161,17 @@ backend: cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768] cuda-graph-max-bs: 768 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: [1024,2048,4096,6144] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "32" + TOTAL_GPUS: "48" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k1k/disagg/stp/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k1k/disagg/stp/low-latency.yaml index 71fd0f889..766ecc632 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k1k/disagg/stp/low-latency.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k1k/disagg/stp/low-latency.yaml @@ -113,9 +113,16 @@ backend: data-parallel-size: 1 expert-parallel-size: 1 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: [4,8] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "4" + DECODE_GPUS: "4" + TOTAL_GPUS: "8" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k1k/disagg/stp/max.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k1k/disagg/stp/max.yaml index 6d219cc1e..a7da42825 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k1k/disagg/stp/max.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k1k/disagg/stp/max.yaml @@ -162,10 +162,17 @@ backend: cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768] cuda-graph-max-bs: 768 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: [2048,4096] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "24" + TOTAL_GPUS: "72" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k1k/disagg/stp/mid.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k1k/disagg/stp/mid.yaml index b085f50f8..6c367ebf3 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k1k/disagg/stp/mid.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k1k/disagg/stp/mid.yaml @@ -162,10 +162,17 @@ backend: cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768] cuda-graph-max-bs: 768 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. 
See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: [128,256,512,1024] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "32" + TOTAL_GPUS: "72" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml index 989fc47d1..76f03d343 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml @@ -106,9 +106,16 @@ backend: speculative-eagle-topk: 1 speculative-num-draft-tokens: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1x2x4x8x16x32x64" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "16" + DECODE_GPUS: "16" + TOTAL_GPUS: "32" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/mtp/h100-fp8-1p2d-max-tp-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/mtp/h100-fp8-1p2d-max-tp-mtp.yaml index 0ce17e8a4..3c6647c24 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/mtp/h100-fp8-1p2d-max-tp-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/mtp/h100-fp8-1p2d-max-tp-mtp.yaml @@ -108,9 +108,16 @@ backend: speculative-eagle-topk: 1 speculative-num-draft-tokens: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1x2x4x8x16x32x64x128" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "16" + DECODE_GPUS: "16" + TOTAL_GPUS: "48" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml index c47b6c867..dc186726c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml @@ -94,9 +94,16 @@ backend: max-running-requests: 64 cuda-graph-max-bs: 64 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1x2x4x8x16x32x64" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "16" + DECODE_GPUS: "16" + TOTAL_GPUS: "32" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/stp/h100-fp8-1p2d-max-tp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/stp/h100-fp8-1p2d-max-tp.yaml index 1f7cf9985..1e4b20c13 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/stp/h100-fp8-1p2d-max-tp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/stp/h100-fp8-1p2d-max-tp.yaml @@ -94,9 +94,16 @@ backend: max-running-requests: 128 cuda-graph-max-bs: 128 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1x2x4x8x16x32x64x128" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "16" + DECODE_GPUS: "16" + TOTAL_GPUS: "48" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml index 4a0448658..17b87aba7 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml @@ -108,9 +108,16 @@ backend: speculative-eagle-topk: 1 speculative-num-draft-tokens: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "1x2x4x8x16x32x64" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "16" + DECODE_GPUS: "16" + TOTAL_GPUS: "32" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/mtp/h100-fp8-1p1d-max-tp-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/mtp/h100-fp8-1p1d-max-tp-mtp.yaml index 591556df7..4dbe673c6 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/mtp/h100-fp8-1p1d-max-tp-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/mtp/h100-fp8-1p1d-max-tp-mtp.yaml @@ -108,9 +108,16 @@ backend: speculative-eagle-topk: 1 speculative-num-draft-tokens: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "1x2x4x8x16x32x64x128" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "16" + DECODE_GPUS: "16" + TOTAL_GPUS: "32" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml index 6c8a1c956..dc186726c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml @@ -94,9 +94,16 @@ backend: max-running-requests: 64 cuda-graph-max-bs: 64 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "1x2x4x8x16x32x64" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "16" + DECODE_GPUS: "16" + TOTAL_GPUS: "32" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/stp/h100-fp8-1p1d-max-tp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/stp/h100-fp8-1p1d-max-tp.yaml index 196e781df..120b9270c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/stp/h100-fp8-1p1d-max-tp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/stp/h100-fp8-1p1d-max-tp.yaml @@ -94,9 +94,16 @@ backend: max-running-requests: 128 cuda-graph-max-bs: 128 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "1x2x4x8x16x32x64x128" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "16" + DECODE_GPUS: "16" + TOTAL_GPUS: "32" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/mtp/bs256-1p6d-dep-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/mtp/bs256-1p6d-dep-mtp.yaml index 2c6539c93..d9177b2e1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/mtp/bs256-1p6d-dep-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/mtp/bs256-1p6d-dep-mtp.yaml @@ -113,9 +113,16 @@ backend: speculative-eagle-topk: 1 speculative-num-draft-tokens: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "128x256x512x1024x2048" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "56" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/mtp/bs256-1p6d-tp-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/mtp/bs256-1p6d-tp-mtp.yaml index f2fc08020..bbdea98a4 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/mtp/bs256-1p6d-tp-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/mtp/bs256-1p6d-tp-mtp.yaml @@ -109,10 +109,16 @@ backend: speculative-eagle-topk: 1 speculative-num-draft-tokens: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - # concurrencies: "128x256x512" - concurrencies: "512x1024x2048" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "56" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/mtp/low-latency-1p9d-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/mtp/low-latency-1p9d-mtp.yaml index 5d6e66ebb..2569666c2 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/mtp/low-latency-1p9d-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/mtp/low-latency-1p9d-mtp.yaml @@ -108,9 +108,16 @@ backend: speculative-eagle-topk: 1 speculative-num-draft-tokens: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1x4x8x16x32x64x128x256" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "80" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/stp/bs256-1p6d-dep.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/stp/bs256-1p6d-dep.yaml index 1932dc222..0d098c736 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/stp/bs256-1p6d-dep.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/stp/bs256-1p6d-dep.yaml @@ -100,10 +100,17 @@ backend: max-running-requests: 512 cuda-graph-max-bs: 512 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "128x256x512x1024x2048" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "56" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/stp/bs256-1p6d-tp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/stp/bs256-1p6d-tp.yaml index 05afea199..af5aded2c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/stp/bs256-1p6d-tp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/stp/bs256-1p6d-tp.yaml @@ -99,11 +99,17 @@ backend: max-running-requests: 512 cuda-graph-max-bs: 512 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - # concurrencies: "128x256x512" - concurrencies: "512x1024x2048" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "56" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/stp/low-latency-1p9d.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/stp/low-latency-1p9d.yaml index e60102aae..9cfc153f2 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/stp/low-latency-1p9d.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/stp/low-latency-1p9d.yaml @@ -98,9 +98,16 @@ backend: max-running-requests: 256 cuda-graph-max-bs: 256 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1x4x8x16x32x64x128x256" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "80" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs128-1p1d-dep-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs128-1p1d-dep-mtp.yaml index 4d62e5a04..292289a7e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs128-1p1d-dep-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs128-1p1d-dep-mtp.yaml @@ -110,9 +110,16 @@ backend: speculative-eagle-topk: 1 speculative-num-draft-tokens: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "32x64x128x256x512"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    MODEL_NAME: "deepseek-ai/DeepSeek-R1"
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "16"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs16-1p3d-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs16-1p3d-mtp.yaml
index 97ea49b9a..76d9f6b1f 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs16-1p3d-mtp.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs16-1p3d-mtp.yaml
@@ -108,9 +108,16 @@ backend:
   speculative-eagle-topk: 1
   speculative-num-draft-tokens: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "4x8x16x32x64"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    MODEL_NAME: "deepseek-ai/DeepSeek-R1"
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "32"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs4-1p7d-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs4-1p7d-mtp.yaml
index d58d55b1b..01a278260 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs4-1p7d-mtp.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs4-1p7d-mtp.yaml
@@ -108,9 +108,16 @@ backend:
   speculative-eagle-topk: 1
   speculative-num-draft-tokens: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "1x4x8"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    MODEL_NAME: "deepseek-ai/DeepSeek-R1"
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "64"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs64-2p3d-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs64-2p3d-mtp.yaml
index ed1232d16..e426c78ba 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs64-2p3d-mtp.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs64-2p3d-mtp.yaml
@@ -110,12 +110,19 @@ backend:
   speculative-eagle-topk: 1
   speculative-num-draft-tokens: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "32x64x128"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    MODEL_NAME: "deepseek-ai/DeepSeek-R1"
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "40"
 
 # benchmark:
 #   type: "gpqa"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs8-1p6d-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs8-1p6d-mtp.yaml
index 5bd83fa5c..2922ba1df 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs8-1p6d-mtp.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs8-1p6d-mtp.yaml
@@ -109,9 +109,16 @@ backend:
   speculative-eagle-topk: 1
   speculative-num-draft-tokens: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "2x4x8x16x32"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    MODEL_NAME: "deepseek-ai/DeepSeek-R1"
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "56"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs128-1p1d-dep.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs128-1p1d-dep.yaml
index d131f6b02..e86438436 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs128-1p1d-dep.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs128-1p1d-dep.yaml
@@ -100,10 +100,17 @@ backend:
   max-running-requests: 256
   cuda-graph-max-bs: 256
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "64x128x256"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    MODEL_NAME: "deepseek-ai/DeepSeek-R1"
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "16"
 
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs16-1p3d.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs16-1p3d.yaml
index 576ff2a03..75e36493b 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs16-1p3d.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs16-1p3d.yaml
@@ -98,10 +98,17 @@ backend:
   max-running-requests: 32
   cuda-graph-max-bs: 32
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "8x16x32"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    MODEL_NAME: "deepseek-ai/DeepSeek-R1"
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "32"
 
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs4-1p7d.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs4-1p7d.yaml
index 78ce3d5a1..56aa58d11 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs4-1p7d.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs4-1p7d.yaml
@@ -98,10 +98,17 @@ backend:
   max-running-requests: 8
   cuda-graph-max-bs: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "1x4x8"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    MODEL_NAME: "deepseek-ai/DeepSeek-R1"
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "64"
 
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs64-2p3d.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs64-2p3d.yaml
index 73aaacc30..7c876e3cf 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs64-2p3d.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs64-2p3d.yaml
@@ -100,12 +100,19 @@ backend:
   max-running-requests: 128
   cuda-graph-max-bs: 128
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "32x64x128"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    MODEL_NAME: "deepseek-ai/DeepSeek-R1"
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "40"
 
 # benchmark:
 #   type: "gpqa"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs8-1p6d.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs8-1p6d.yaml
index c37c50eea..5eeba8f61 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs8-1p6d.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs8-1p6d.yaml
@@ -99,10 +99,17 @@ backend:
   max-running-requests: 16
   cuda-graph-max-bs: 16
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "4x8x16"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    MODEL_NAME: "deepseek-ai/DeepSeek-R1"
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "56"
 
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml
index 9532b9cc5..6b34b2fb7 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml
@@ -102,12 +102,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "875"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "44"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
index 31bf5bf20..4445c953b 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
@@ -97,12 +97,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "6"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "44"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml
index 3a3309f56..b7d1c9260 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml
@@ -111,12 +111,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "10x15x25x45x90x180"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "44"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml
index 90ad2c657..d5def7a35 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml
@@ -105,12 +105,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 1
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "4968"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "44"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml
index 31adc6239..dde552b51 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml
@@ -111,12 +111,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 1
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "10860"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "4"
+    TOTAL_GPUS: "32"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml
index 6c3e4bf80..275c140a5 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml
@@ -102,12 +102,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "4096"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "12"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml
index 56746330e..ae7ba8483 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml
@@ -99,12 +99,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "2192"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "20"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml
index 0fde29f21..16961a5e0 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml
@@ -97,12 +97,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "1365"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "44"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml
index 4612b7c2c..ac84ded85 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml
@@ -91,12 +91,18 @@ backend:
   num_postprocess_workers: 4
   allreduce_strategy: MNNVL
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "6"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "44"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml
index 53e833b75..930f2520f 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml
@@ -112,12 +112,18 @@ backend:
   num_postprocess_workers: 4
   allreduce_strategy: MNNVL
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "10x15x25x45x90x180"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "44"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml
index 47c2c6e22..d90c6f3b0 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml
@@ -101,12 +101,18 @@ backend:
   num_postprocess_workers: 4
   allreduce_strategy: MNNVL
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "450"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "52"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml
index a1ec4f38d..1017f8feb 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml
@@ -101,12 +101,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "90"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "12"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml
index 48aad03b6..4c919e2e1 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml
@@ -108,12 +108,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "66"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "28"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
index 559841f73..dec75f377 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
@@ -97,12 +97,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "6"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "44"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml
index f9d9843f6..1c8582c31 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml
@@ -104,12 +104,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "10x15x30x60"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "44"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml
index 7e06d12b5..37ab36d1f 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml
@@ -105,12 +105,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "548"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "20"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml
index 96b4d97c5..693c2221c 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml
@@ -109,12 +109,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 1
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "1096x1691"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "28"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml
index 98229c7bf..ffbc9ae61 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml
@@ -104,12 +104,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "658"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "36"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml
index 762987f6e..b2c967541 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml
@@ -92,12 +92,18 @@ backend:
   allreduce_strategy: MNNVL
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "6"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "44"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml
index a03114f95..0f88bb006 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml
@@ -105,12 +105,18 @@ backend:
   num_postprocess_workers: 4
   allreduce_strategy: MNNVL
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "10x15x25x50x100"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "44"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml
index 4dfe07604..738dd82ea 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml
@@ -100,12 +100,18 @@ backend:
   num_postprocess_workers: 4
   allreduce_strategy: MNNVL
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "370"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "48"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml
index 23c2db5d8..22681d23a 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml
@@ -103,12 +103,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "1606"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "24"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml
index e94326803..6e233467a 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml
@@ -96,12 +96,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "837"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "40"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml
index b3c9e1300..99f0ea58f 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml
@@ -99,12 +99,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "2222"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "44"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml
index 8c7cf706d..0fbd25b82 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml
@@ -100,12 +100,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [1600]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "24"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml
index dd06e8462..fe3ab4c6c 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml
@@ -100,12 +100,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [1184]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "32"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml
index d41d81458..ab8b4d1c6 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml
@@ -100,12 +100,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [1024]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "40"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml
index 3b4193e44..a2665a5a4 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml
@@ -100,12 +100,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [896]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "64"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml
index de08fe729..057fcbd77 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml
@@ -100,12 +100,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [8]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "72"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml
index 0b67948c3..e42404618 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml
@@ -100,12 +100,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [256]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "72"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml
index a79351e20..042c00923 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml
@@ -100,12 +100,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [32]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "72"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml
index 1814ff355..9ad27278a 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml
@@ -100,12 +100,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [64]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "72"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml
index 2e0ac949f..65aeecbfa 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml
@@ -94,12 +94,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [4096]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "16"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml
index 47008c9f0..6159a29ad 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml
@@ -94,12 +94,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [128]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "32"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml
index aa2d8c6f2..58d800b6a 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml
@@ -94,12 +94,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [32]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "32"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml
index b9829e22f..0ed6396a0 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml
@@ -94,12 +94,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [4]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "32"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml
index 56df5bad2..875279c47 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml
@@ -94,12 +94,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [1920]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "48"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml
index a412a6419..c277966c4 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml
@@ -94,12 +94,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [5152]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "56"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml
index 2ccfffba7..7f03ae1e3 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml
@@ -102,12 +102,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: [8]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "24"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml
index a9ad0a7d9..712a67416 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml
@@ -102,12 +102,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: [64]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "40"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml
index 38b12e6c0..4212abd06 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml
@@ -102,12 +102,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: [48]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "56"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml
index 3b38311b7..f3e356085 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml
@@ -102,12 +102,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: [8]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "56"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml
index 378123831..cda4cecfd 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml
@@ -104,12 +104,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: [288]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "24"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml
index a26eaf4f1..1cdb3af76 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml
@@ -104,12 +104,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: [224]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "40"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml
index 3c659d4dc..359073927 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml
@@ -104,12 +104,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: [1088]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "40"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml
index 6c383e60e..7a9a20391 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml
@@ -94,12 +94,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: [128]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "16"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml
index 7821ab79e..3f93f9140 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml
@@ -94,12 +94,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: [256]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "16"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml
index 0f2fdd949..ca1c1d60f 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml
@@ -96,12 +96,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: [1]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "16"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml
index 305c15124..6b03210e3 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml
@@ -94,12 +94,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: [128]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "24"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml
index 3c64aacf5..38ed548da 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml
@@ -95,12 +95,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: [128]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "40"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml
index 751bdd585..f086c23c0 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml
@@ -95,12 +95,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: [32]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "40"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml
index cb4c4d8a3..39f1bffd8 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml
@@ -95,12 +95,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: [96]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "56"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml
index db804a6b6..2b787d7f4 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml
@@ -94,12 +94,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: [640]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "24"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml
index 36b365a7d..554db4ec4 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml
@@ -112,12 +112,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "654"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "10"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml
index f2cd900c9..497739ac7 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml
@@ -110,12 +110,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "271"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "18"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
index 31bae1596..0fbaeb745 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
@@ -108,12 +108,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "11"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "42"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml
index eeb43290a..2d9df253b 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml
@@ -124,12 +124,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "10x20x25x60x120x200"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "42"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml
index 7f8b9ae4a..c356b1b19 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml
@@ -114,12 +114,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 1
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "2342"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "12"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml
index 98d8ab04d..5735ea337 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml
@@ -115,12 +115,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 1
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "8609"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "26"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml
index a81e980ec..1eed2b318 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml
@@ -116,12 +116,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 1
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "12926"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "26"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml
index 13978a422..7d11fb152 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml
@@ -106,12 +106,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "1176"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "18"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
index 5885277d0..458ce824d 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
@@ -102,12 +102,18 @@ backend:
   num_postprocess_workers: 4
   allreduce_strategy: MNNVL
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "6"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "34"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml
index 9d73c7308..3e493c98e 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml
@@ -106,12 +106,18 @@ backend:
   num_postprocess_workers: 4
   allreduce_strategy: MNNVL
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "5x10x15x25"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "4"
+    TOTAL_GPUS: "22"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml
index 92b99de35..adb4a8b79 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml
@@ -121,12 +121,18 @@ backend:
   num_postprocess_workers: 4
   allreduce_strategy: MNNVL
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "60x110x195x395"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "42"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml
index 3113744c9..8bd76075a 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml
@@ -109,12 +109,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "4405"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "12"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml
index d74782639..76d4cd780 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml
@@ -114,12 +114,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "8192"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "14"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml
index 5088b566c..3c0692530 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml
@@ -108,12 +108,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "4611"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "22"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml
index c24f57918..5f522818a 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml
@@ -114,12 +114,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 1
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "2198"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "28"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml
index 7e2ab395a..41f443c22 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml
@@ -112,12 +112,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "52"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "4"
+    TOTAL_GPUS: "18"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml
index 83c7af6ad..ff3bca726 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml
@@ -108,12 +108,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "8"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "34"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml
index 723029b8d..87c3c57b6 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml
@@ -111,12 +111,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "32"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "34"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml
index 67e9fc568..3f40345ca 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml
@@ -110,12 +110,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "181"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "14"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml
index b0494f78f..a52be413d 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml
@@ -113,12 +113,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 1
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "1197"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "26"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml
index 5bc38c22a..f515e9aba 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml
@@ -108,12 +108,18 @@ backend:
   num_postprocess_workers: 4
   allreduce_strategy: MNNVL
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "105"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "4"
+    TOTAL_GPUS: "14"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml
index 002aa9e27..7a167eb80 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml
@@ -106,12 +106,18 @@ backend:
   num_postprocess_workers: 4
   allreduce_strategy: MNNVL
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "63"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "26"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml
index 5e8d96a80..36a6268eb 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml
@@ -115,12 +115,18 @@ backend:
   num_postprocess_workers: 4
   allreduce_strategy: MNNVL
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "4"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "26"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml
index df7612f99..d184a95d5 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml
@@ -103,12 +103,18 @@ backend:
   num_postprocess_workers: 4
   allreduce_strategy: MNNVL
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "12"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "4"
+    TOTAL_GPUS: "18"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml
index b791d44b8..bacd57645 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml
@@ -105,12 +105,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "589"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "26"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml
index 09b89137c..923b32c05 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml
@@ -113,12 +113,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "1093"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "20"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml
index 0ca0d7692..1173417cc 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml
@@ -113,12 +113,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "2048"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "24"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml
index cfa58f2a3..9e1da3cf3 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml
@@ -112,12 +112,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [3072]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "12"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml
index 866ccbb8e..d1ccc8b44 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml
@@ -112,12 +112,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [2560]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "20"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml
index 4e7600a2c..74802bbc7 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml
@@ -112,12 +112,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [720]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "44"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml
index a00639e26..4a09efd68 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml
@@ -113,12 +113,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [160]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "68"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml
index 62ae3984f..a6cbb9b66 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml
@@ -113,12 +113,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [10]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "68"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml
index 957676992..7ccdfa4af 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml
@@ -112,12 +112,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [11264]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "28"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml
index f41079a54..fa0675ade 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml
@@ -106,12 +106,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [2112]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "12"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml
index 7746b638c..121844730 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml
@@ -106,12 +106,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [3072]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "20"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml
index bdaef8f3e..7a7b2e1fe 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml
@@ -106,12 +106,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [1280]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "28"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml
index f469bf3bc..0e75f3747 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml
@@ -107,12 +107,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [10]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "68"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml
index b3b2d8740..384ef6e0c 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml
@@ -107,12 +107,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [128]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "68"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml
index 36476736b..5fb7781d4 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml
@@ -107,12 +107,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [384]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "68"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml
index c9d131239..364b538d6 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml
@@ -106,12 +106,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [16384]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "16"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml
index 7e806469c..1039c9e2c 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml
@@ -112,12 +112,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: [72]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "12"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml
index c203b724a..89a1abdd3 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml
@@ -113,12 +113,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: [40]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "20"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml
index 48773bf14..87ad50002 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml
@@ -113,12 +113,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: [5]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "36"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml
index bba0d5a65..4edbcf88d 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml
@@ -113,12 +113,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: [20]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "36"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml
index 9511ede04..7eba0cdd6 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml
@@ -112,12 +112,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: [144]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "16"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml
index 7513770d8..555ec7688 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml
@@ -112,12 +112,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: [512] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "24" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml index 2852df6c3..8c9160c66 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml @@ -107,12 +107,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: [64] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml index 68ae8f4dc..54de6c71f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml @@ -107,12 +107,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: [10] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "68" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml index 1c2977396..4e7808183 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml @@ -106,12 +106,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: [256] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "16" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml index 343b25905..6d6573b24 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml @@ -106,12 +106,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: [512] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "20" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml index 5aa5546ab..dd915b01d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml @@ -107,12 +107,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: [256] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "52" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml index df8c2831c..1e0375787 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml @@ -106,12 +106,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: [1075] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "28" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml index 9b0df56e9..eb6170f6a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml @@ -106,12 +106,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: [3072] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml index a8f90e9bd..f6cb09bbc 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml @@ -99,12 +99,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "180" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "32" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml index be4f29045..aa711f76c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml @@ -103,12 +103,18 @@ backend: speculative_config: decoding_type: MTP num_nextn_predict_layers: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4x8x12x24x48" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml index 5dd8a302b..50a8aa6c4 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml @@ -134,12 +134,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 1 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4301" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "16" + TOTAL_GPUS: "24" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml index 08fc612ec..53fae254f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml @@ -110,12 +110,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 1 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "2253" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "32" + TOTAL_GPUS: "44" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml index 44a05c484..507a15f85 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml @@ -195,12 +195,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 1 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "16130" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "4" + TOTAL_GPUS: "32" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml index c353c3df0..24294befe 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml @@ -95,12 +95,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "666" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "32" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml index a62b540d9..67fd9d9a4 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml @@ -157,12 +157,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4301" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "12" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml index d56eba13c..57be7c35e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml @@ -189,12 +189,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "6144" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "4" + TOTAL_GPUS: "12" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml index 94a45661b..e8794eae8 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml @@ -92,12 +92,18 @@ backend: num_postprocess_workers: 4 allreduce_strategy: MNNVL +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "5" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml index a93c86f82..e9d59aaab 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml @@ -107,12 +107,18 @@ backend: num_postprocess_workers: 4 allreduce_strategy: MNNVL +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "12x24x48x96x192" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml index 9aa57eb46..c752a5600 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml @@ -128,12 +128,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4301" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "16" + TOTAL_GPUS: "24" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml index 3501708c2..118580aa9 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml @@ -101,12 +101,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "2253" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "32" + TOTAL_GPUS: "40" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml index 0a88341a1..0ccf95443 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml @@ -134,12 +134,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 1 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4301" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "16" + TOTAL_GPUS: "60" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml index b4dd6005d..2854854f2 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml @@ -105,12 +105,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4x8x12x24x48" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml index 9374538f8..bddcf060e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml @@ -99,12 +99,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "180" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "32" + TOTAL_GPUS: "44" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml index a62e4f24f..eb101a191 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml @@ -110,12 +110,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 1 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "1229" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "16" + TOTAL_GPUS: "44" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml index ee3082fe5..3bf47d0a8 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml @@ -101,12 +101,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "666" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "32" + TOTAL_GPUS: "64" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml index 4df408491..7cfee6b2e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml @@ -128,12 +128,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4096" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "16" + TOTAL_GPUS: "56" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml index 4b603ad67..a7e491533 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml @@ -103,12 +103,18 @@ backend: num_postprocess_workers: 4 allreduce_strategy: MNNVL +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "12x44x76" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml index 1ee953844..fa6483998 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml @@ -94,12 +94,18 @@ backend: +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "5" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml index b08791f00..c0d6dc3f3 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml @@ -94,12 +94,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "333" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "32" + TOTAL_GPUS: "40" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml index 7f4e9594e..b78f93a10 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml @@ -97,12 +97,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "1229" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "32" + TOTAL_GPUS: "60" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml index 059688716..080186d0f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml @@ -109,12 +109,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "2253" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "16" + TOTAL_GPUS: "48" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml index ba7f2ff21..6ea81b176 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml @@ -105,12 +105,18 @@ backend: tensor_parallel_size: 16 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['1229'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "16" + TOTAL_GPUS: "24" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml index 218b85744..8e5f86356 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml @@ -99,12 +99,18 @@ backend: tensor_parallel_size: 32 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['615'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "32" + TOTAL_GPUS: "40" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml index fe49d8959..a96a862ef 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml @@ -129,12 +129,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
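+#
+# Note (an observed convention in this patch, not a stated launcher
+# requirement): the gb200-fp8 recipes pass PREFILL_GPUS=8, i.e. one ctx worker
+# spans 8 GPUs here, versus 4 in the b300-fp8 and gb200-fp4 recipes above and
+# 2 in the gb300-fp4 recipes below. The same TOTAL_GPUS arithmetic holds,
+# e.g. for this ctx1_gen1_dep8 recipe: 1 * 8 + 1 * 8 = 16.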
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['2151'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "16" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml index 25847ed23..449ca1d85 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml @@ -161,12 +161,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['4301'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "16" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml index 62d4be838..e6f72bd07 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml @@ -98,12 +98,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['9'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "32" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml index 47f21d46b..519f5da0c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml @@ -98,12 +98,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['18'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "32" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml index ecb7c92cd..23c1180d5 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml @@ -99,12 +99,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['36'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "32" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml index 47b869af5..868c65032 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml @@ -107,12 +107,18 @@ backend: tensor_parallel_size: 16 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['2151'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "16" + TOTAL_GPUS: "24" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml index d1e3cae50..64f1004f5 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml @@ -95,12 +95,18 @@ backend: tensor_parallel_size: 32 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['1127'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "32" + TOTAL_GPUS: "40" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml index c48edbd5f..05f3d0763 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml @@ -92,12 +92,18 @@ backend: tensor_parallel_size: 32 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['256'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "32" + TOTAL_GPUS: "40" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml index 08139cf82..5fcaf989c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml @@ -155,12 +155,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['4301'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "16" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml index 14b33599c..5f54ed0f7 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml @@ -187,12 +187,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['6144'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "16" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml index 2b9250430..801c5214a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml @@ -92,12 +92,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['3'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "32" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml index 160f4c6ca..9c57a2897 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml @@ -93,12 +93,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['27'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "32" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml index 8f305ced0..12632ffd1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml @@ -98,12 +98,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['6'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "32" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml index bea950ac7..a80c790f9 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml @@ -98,12 +98,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['15'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "32" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml index fbf861990..1f108d424 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml @@ -97,12 +97,18 @@ backend: tensor_parallel_size: 32 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['90'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "32" + TOTAL_GPUS: "48" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml index ea8a7d013..08f63213f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml @@ -99,12 +99,18 @@ backend: tensor_parallel_size: 16 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['333'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "16" + TOTAL_GPUS: "40" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml index 2ad2e727d..982765ae5 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml @@ -105,12 +105,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['666'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "32" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml index 95bf6192f..6b286ce2e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml @@ -98,12 +98,18 @@ backend: tensor_parallel_size: 32 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['333'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "32" + TOTAL_GPUS: "64" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml index 35da2b70f..9bc424961 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml @@ -101,12 +101,18 @@ backend: tensor_parallel_size: 16 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['666'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "16" + TOTAL_GPUS: "56" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml index 178a3b7df..0430ce4b1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml @@ -94,12 +94,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['63'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "32" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml index f33813fd9..d1b526a07 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml @@ -92,12 +92,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['6'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "32" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml index 98aee313b..fdf1e856c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml @@ -92,12 +92,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['18'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "32" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml index 816065639..2dffe83f1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml @@ -92,12 +92,18 @@ backend: tensor_parallel_size: 32 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['333'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "32" + TOTAL_GPUS: "48" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml index f7d87c1b3..ba7c6142f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml @@ -95,12 +95,18 @@ backend: tensor_parallel_size: 16 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['615'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "16" + TOTAL_GPUS: "40" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml index 27a19e5b8..8675bf58d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml @@ -93,12 +93,18 @@ backend: tensor_parallel_size: 32 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['666'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "32" + TOTAL_GPUS: "64" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml index 634f07cdb..ca9b432d0 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml @@ -99,12 +99,18 @@ backend: tensor_parallel_size: 16 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['1229'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "16" + TOTAL_GPUS: "56" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml index b4434cdda..b3d1dd62a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml @@ -103,12 +103,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "333" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "32" + TOTAL_GPUS: "34" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml index e264a1796..2b9d42408 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml @@ -198,12 +198,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 1 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "3226" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "4" + TOTAL_GPUS: "6" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml index 67c672ffb..c2c4c537a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml @@ -101,12 +101,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "5" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "8" + TOTAL_GPUS: "34" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml index aab184727..da70d4074 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml @@ -106,12 +106,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "8x12x24x48" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "8" + TOTAL_GPUS: "34" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml index 58cbacdf4..12174174c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml @@ -121,12 +121,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 1 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "2253" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "16" + TOTAL_GPUS: "22" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml index 698989630..502ae7cf2 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml @@ -109,12 +109,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1229" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "32" + TOTAL_GPUS: "38" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml index 642aa6c43..cba8a4f64 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml @@ -95,12 +95,18 @@ backend: num_postprocess_workers: 4 allreduce_strategy: MNNVL +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "5" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "8" + TOTAL_GPUS: "34" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml index 44774b6bc..794556055 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml @@ -109,12 +109,18 @@ backend: num_postprocess_workers: 4 allreduce_strategy: MNNVL +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "12x48x96x192" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "8" + TOTAL_GPUS: "34" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml index ffc2850fb..8249a5369 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml @@ -99,12 +99,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1229" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "32" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml index 28e148d02..5f96315ff 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml @@ -223,12 +223,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "8192" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "8" + TOTAL_GPUS: "12" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml index 4d4ffe594..50f4f8f0f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml @@ -131,12 +131,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
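+#
+# $INFMAX_WORKSPACE is left unexpanded in the mount key on purpose: the
+# source path is presumably resolved from the launch environment when the
+# bench container starts, so the recipe stays portable across checkouts.
+# If the variable is unset, expect the mount (and the custom command with
+# it) to fail rather than fall back to a default.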
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4301" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "16" + TOTAL_GPUS: "22" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml index de841c92c..9acddc31e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml @@ -104,12 +104,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "2253" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "32" + TOTAL_GPUS: "38" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml index 7bf2a9332..4d258c289 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml @@ -105,12 +105,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "666" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "16" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml index 09710a97d..c10a8598b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml @@ -133,12 +133,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 1 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "2253" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "8" + TOTAL_GPUS: "28" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml index 61988358c..df0375f0e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml @@ -113,12 +113,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "1127" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "16" + TOTAL_GPUS: "42" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml index f07f607ea..6ce834ce3 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml @@ -104,12 +104,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "33" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "8" + TOTAL_GPUS: "26" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml index be9842323..53771a342 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml @@ -101,12 +101,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
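+#
+# The command line is identical in every converted recipe; all per-recipe
+# variation flows through env. That presumably keeps srt_bench.sh as the
+# single entrypoint for the whole sweep, with the GPU-topology variables
+# above as its only per-recipe inputs.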
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "5" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "8" + TOTAL_GPUS: "34" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml index 5d45c06d3..b2349f421 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml @@ -104,12 +104,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "12x24" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "8" + TOTAL_GPUS: "34" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml index c0c4f66e7..ddd5641a9 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml @@ -101,12 +101,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "180" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "32" + TOTAL_GPUS: "40" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml index e719310a4..aaca79561 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml @@ -102,12 +102,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "308" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "32" + TOTAL_GPUS: "48" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml index 6b6f4a36e..f141a5005 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml @@ -128,12 +128,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "3228" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "4" + TOTAL_GPUS: "34" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml index 42523722e..882083834 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml @@ -111,12 +111,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "2253" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "16" + TOTAL_GPUS: "44" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml index 34678b650..e4568f7e1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml @@ -99,12 +99,18 @@ backend: num_postprocess_workers: 4 allreduce_strategy: MNNVL +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "72" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "8" + TOTAL_GPUS: "26" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml index 158dd4ed9..5a6e21737 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml @@ -95,12 +95,18 @@ backend: num_postprocess_workers: 4 allreduce_strategy: MNNVL +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "5" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "8" + TOTAL_GPUS: "34" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml index f2f18332c..4b8ad5a43 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml @@ -96,12 +96,18 @@ backend: num_postprocess_workers: 4 allreduce_strategy: MNNVL +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "12" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "8" + TOTAL_GPUS: "34" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml index f380710f8..6f6194a84 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml @@ -96,12 +96,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "5x15x30" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "4" + TOTAL_GPUS: "22" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml index 8dbb94ea5..f68b83534 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml @@ -98,12 +98,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "666" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "32" + TOTAL_GPUS: "46" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml index eba48a69c..db6ae1b3f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml @@ -104,12 +104,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "1229" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "16" + TOTAL_GPUS: "34" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml index fd4c842d5..f03320ce7 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml @@ -103,12 +103,18 @@ backend: tensor_parallel_size: 16 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
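+#
+# In the hunks touched here, PREFILL_GPUS tracks the platform: 8 on the
+# gb200-fp8 recipes, 2 on gb300-fp4, and 4 on gb300-fp8. That reads as the
+# per-worker prefill sizing each config was tuned for, rather than anything
+# srt_bench.sh itself requires.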
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['666'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "16" + TOTAL_GPUS: "20" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml index 24cc7fcb2..3783dd563 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml @@ -99,12 +99,18 @@ backend: tensor_parallel_size: 32 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['180'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "32" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml index dd886c1c6..d4cf77025 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml @@ -100,12 +100,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['8'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml index 6625fde5d..e6d895550 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml @@ -100,12 +100,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['24'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml index 14b8c83ec..f178dc30a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml @@ -115,12 +115,18 @@ backend: tensor_parallel_size: 16 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['2253'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "16" + TOTAL_GPUS: "24" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml index 30335f8e4..562ada512 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml @@ -101,12 +101,18 @@ backend: tensor_parallel_size: 32 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['564'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "32" + TOTAL_GPUS: "40" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml index 5985d197c..87ba559b2 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml @@ -163,12 +163,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['8192'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "28" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml index 5d74bf4f0..57803a156 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml @@ -96,12 +96,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['84'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml index 9b51b74ce..3f3905468 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml @@ -94,12 +94,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['4'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml index bc0a9ad4a..6e2ba5e8e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml @@ -94,12 +94,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
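+#
+# Note that the old sa-bench fields (isl/osl/concurrencies/req_rate) do not
+# survive the switch to the custom client: the benchmark block now carries
+# only GPU topology, so the workload shape is presumably fixed inside
+# srt_bench.sh, or derived there from the recipe path.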
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['24'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml index 126e651e1..2580bab99 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml @@ -109,12 +109,18 @@ backend: tensor_parallel_size: 16 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['2253'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "16" + TOTAL_GPUS: "24" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml index f66062760..c7dc2dcdd 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml @@ -97,12 +97,18 @@ backend: tensor_parallel_size: 32 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['1229'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "32" + TOTAL_GPUS: "40" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml index 68a326b76..c4613dbb2 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml @@ -157,12 +157,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['8602'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "28" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml index 8cd72351d..bdc07bf9d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml @@ -189,12 +189,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['12288'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "28" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml index 6123b194f..95a1bd02e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml @@ -107,12 +107,18 @@ backend: tensor_parallel_size: 16 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['1229'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "16" + TOTAL_GPUS: "56" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml index 3c61eca96..644b5a20b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml @@ -100,12 +100,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['8'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml index 539a3f780..5c7a8ed5c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml @@ -100,12 +100,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['24'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml index 49e94caa5..c78705873 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml @@ -100,12 +100,18 @@ backend: tensor_parallel_size: 32 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['333'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "32" + TOTAL_GPUS: "56" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml index e531467ca..e00287de7 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml @@ -115,12 +115,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['1229'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml index fadb3c8c1..162f003e4 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml @@ -103,12 +103,18 @@ backend: tensor_parallel_size: 16 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['666'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "16" + TOTAL_GPUS: "48" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml index 30ba58dcd..3a470113e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml @@ -94,12 +94,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['4'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml index 091164082..8b14ffd93 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml @@ -94,12 +94,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: ['24']
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "36"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml
index de8d408d1..f5994c054 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml
@@ -95,12 +95,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: ['36']
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "36"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml
index 70aade3de..fcf7292da 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml
@@ -97,12 +97,18 @@ backend:
 
   tensor_parallel_size: 16
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: ['666']
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "32"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml
index cfe8dead6..ac8d6faa6 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml
@@ -95,12 +95,18 @@ backend:
 
   tensor_parallel_size: 32
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: ['512']
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "32"
+    TOTAL_GPUS: "56"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml
index 97745e8c8..e585cc065 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml
@@ -101,12 +101,18 @@ backend:
 
   tensor_parallel_size: 16
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: ['1229']
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "44"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml
index 09e23abed..87272ba14 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml
@@ -125,12 +125,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: ['2151']
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "36"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml
index 104f3b4ab..67da71d3d 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml
@@ -92,12 +92,18 @@ backend:
   speculative_config:
     decoding_type: MTP
     num_nextn_predict_layers: 2
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 1024
-  osl: 1024
-  concurrencies: '615'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "32"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml
index 4c41ec82a..766d7fd79 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml
@@ -96,12 +96,18 @@ backend:
   speculative_config:
     decoding_type: MTP
     num_nextn_predict_layers: 1
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 1024
-  osl: 1024
-  concurrencies: '1229'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "32"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml
index c3dc14082..d2e17ac7a 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml
@@ -88,12 +88,18 @@ backend:
   speculative_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 1024
-  osl: 1024
-  concurrencies: '231'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml
index 8f3663c94..a48f9c94a 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml
@@ -101,12 +101,18 @@ backend:
   speculative_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 1024
-  osl: 1024
-  concurrencies: '462'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml
index bd77671ac..c07b82fad 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml
@@ -87,12 +87,18 @@ backend:
   speculative_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 1024
-  osl: 1024
-  concurrencies: '60'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
index c1fccbc9d..d64e9777c 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
@@ -85,12 +85,18 @@ backend:
   speculative_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 1024
-  osl: 1024
-  concurrencies: '6'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
index 15c71e8d3..077357b39 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
@@ -85,12 +85,18 @@ backend:
   speculative_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 1024
-  osl: 1024
-  concurrencies: '9'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml
index 4f261058e..414388c6b 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml
@@ -89,12 +89,18 @@ backend:
   speculative_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 1024
-  osl: 1024
-  concurrencies: '117'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
index 07de7a34d..d49f37947 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
@@ -86,12 +86,18 @@ backend:
   speculative_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 1024
-  osl: 1024
-  concurrencies: '30'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml
index 4a55e5ed8..1624bcc3e 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml
@@ -84,12 +84,18 @@ backend:
     backend: UCX
   stream_interval: 100
   num_postprocess_workers: 4
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 1024
-  osl: 1024
-  concurrencies: '924'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml
index 2bedf4c23..f632508e1 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml
@@ -86,12 +86,18 @@ backend:
     backend: UCX
   stream_interval: 100
   num_postprocess_workers: 4
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 1024
-  osl: 1024
-  concurrencies: '1845'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml
index 1ff9ace49..6cd4b7697 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml
@@ -82,12 +82,18 @@ backend:
     backend: UCX
   stream_interval: 100
   num_postprocess_workers: 4
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 1024
-  osl: 1024
-  concurrencies: '231'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml
index 215e8a6bf..10ab482b3 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml
@@ -83,12 +83,18 @@ backend:
     backend: UCX
   stream_interval: 100
   num_postprocess_workers: 4
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 1024
-  osl: 1024
-  concurrencies: '462'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml
index 4281abed2..850acc0da 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml
@@ -81,12 +81,18 @@ backend:
     backend: UCX
   stream_interval: 100
   num_postprocess_workers: 4
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 1024
-  osl: 1024
-  concurrencies: '60'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
index a0e0005e8..a1d5c9aac 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
@@ -79,12 +79,18 @@ backend:
     backend: UCX
   stream_interval: 100
   num_postprocess_workers: 4
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 1024
-  osl: 1024
-  concurrencies: '6'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
index 6eee90d2d..c3b1144bd 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
@@ -79,12 +79,18 @@ backend:
     backend: UCX
   stream_interval: 100
   num_postprocess_workers: 4
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 1024
-  osl: 1024
-  concurrencies: '9'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
index 29e634316..2e972e14b 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
@@ -80,12 +80,18 @@ backend:
     backend: UCX
   stream_interval: 100
   num_postprocess_workers: 4
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 1024
-  osl: 1024
-  concurrencies: '30'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml
index bb02cdd0a..3dd8f5482 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml
@@ -114,12 +114,18 @@ backend:
     backend: UCX
   stream_interval: 100
   num_postprocess_workers: 4
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 1024
-  osl: 1024
-  concurrencies: '4916'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "48"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml
index b78cb01af..007d7e4eb 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml
@@ -88,12 +88,18 @@ backend:
   speculative_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 8192
-  osl: 1024
-  concurrencies: '77'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "32"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml
index dd0ddda85..ecf82c12b 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml
@@ -90,12 +90,18 @@ backend:
   speculative_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 8192
-  osl: 1024
-  concurrencies: '78'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "48"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
index 2f0ef4e90..221dfc3f7 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
@@ -86,12 +86,18 @@ backend:
   speculative_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 8192
-  osl: 1024
-  concurrencies: '6'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
index be3fc74ce..3b6a18fe6 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
@@ -86,12 +86,18 @@ backend:
   speculative_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 8192
-  osl: 1024
-  concurrencies: '9'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
index 6a710bbb5..baf2c1e0d 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
@@ -87,12 +87,18 @@ backend:
   speculative_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 8192
-  osl: 1024
-  concurrencies: '30'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml
index 4d746af13..8be542e76 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml
@@ -89,12 +89,18 @@ backend:
   speculative_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 8192
-  osl: 1024
-  concurrencies: '154'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "48"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml
index 2f630277e..0bf877f96 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml
@@ -89,12 +89,18 @@ backend:
 
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "154"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "48"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
index 9081201ba..b68e4f1a5 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
@@ -81,12 +81,18 @@ backend:
     backend: UCX
   stream_interval: 100
   num_postprocess_workers: 4
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 8192
-  osl: 1024
-  concurrencies: '6'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
index 938fd965c..06b713a32 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
@@ -89,12 +89,18 @@ backend:
 
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "9"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
index c1eb86c19..030c98654 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
@@ -89,12 +89,18 @@ backend:
 
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "30"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml
index 40c84770f..1f882bc75 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml
@@ -84,12 +84,18 @@ backend:
     backend: UCX
   stream_interval: 100
   num_postprocess_workers: 4
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 8192
-  osl: 1024
-  concurrencies: '308'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "48"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml
index 7c3fc7c0e..230e3a281 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml
@@ -92,12 +92,18 @@ backend:
     decoding_type: MTP
     num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "896"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "64"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml
index 4feb8690d..b66e9d91a 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml
@@ -122,12 +122,18 @@ backend:
     decoding_type: MTP
     num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "144"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "80"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml
index 522618223..246c12a61 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml
@@ -102,12 +102,18 @@ backend:
     decoding_type: MTP
     num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "13"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "96"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml
index 5be701be2..84c66f292 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml
@@ -92,12 +92,18 @@ backend:
     decoding_type: MTP
     num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "1024"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "40"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
index 6e8464280..898b6b248 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
@@ -92,12 +92,18 @@ backend:
     decoding_type: MTP
     num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "352"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "96"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
index 69f96bac7..ff64103a1 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
@@ -120,12 +120,18 @@ backend:
     decoding_type: MTP
     num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "44"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "96"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml
index a7275865f..04d320697 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml
@@ -138,12 +138,18 @@ backend:
     decoding_type: MTP
     num_nextn_predict_layers: 1
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "1024"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "24"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml
index b68aae478..af18c65d3 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml
@@ -122,12 +122,18 @@ backend:
     decoding_type: MTP
     num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "512"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "72"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
index 506a8c580..f0e0f9a58 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
@@ -92,12 +92,18 @@ backend:
     decoding_type: MTP
     num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "88"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "96"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml
index 5d910619d..eaa74f374 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml
@@ -167,12 +167,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "1152"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "80"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
index a11789b29..03de93867 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
@@ -132,12 +132,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "144"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "80"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml
index 554f516e2..0f29aab2f 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml
@@ -98,12 +98,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "11"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "80"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml
index c48eded81..4393dacf8 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml
@@ -86,12 +86,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "1536"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "56"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
index 473753df3..9b2d8fbf5 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
@@ -132,12 +132,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "288"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "80"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
index 80784e19d..ee3a951cf 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
@@ -132,12 +132,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "36"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "80"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml
index 7c695e47f..6356363ac 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml
@@ -167,12 +167,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "3584"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "72"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
index 69d7b8708..ce67bee55 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
@@ -132,12 +132,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "576" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "80" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml index 0c1828f27..a5522bdad 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml @@ -132,12 +132,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "72" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "80" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml index 3bacea3c6..1ad52f9f3 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml @@ -102,12 +102,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 2 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "128" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "24" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml index eaa4536a4..23ad0751a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml @@ -102,12 +102,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 2 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "48" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "32" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml index d84bf05a5..4649032a7 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml @@ -102,12 +102,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "9" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "64" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml index 19fa4c9f0..92ed944df 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml @@ -102,12 +102,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 2 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "256" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "32" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml index 6eca7fe9d..01616d163 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml @@ -102,12 +102,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "160" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "64" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml index 6cfd09aad..78cc69344 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml @@ -102,12 +102,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "28" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "64" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml index ab5a8fa71..607011f5c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml @@ -102,12 +102,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 1 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "512" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "32" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml index 219a6f1b8..02db00cb0 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml @@ -102,12 +102,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 2 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "64" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "16" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml index d8dd374c2..89cefb58e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml @@ -102,12 +102,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "48" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "56" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml index b92ecafe9..6f9e2c92e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml @@ -99,12 +99,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "128" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "16" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml index 65eddfb81..a7cc5137e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml @@ -96,12 +96,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "48" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "32" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml index f42e7d15d..82064a374 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml @@ -96,12 +96,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "9" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "64" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml index 5f96d875a..da13164cd 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml @@ -96,12 +96,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "768" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "64" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml index 5f2976b4d..38d63593a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml @@ -96,12 +96,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "160" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "56" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml index 72974bb20..19ba51ba6 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml @@ -96,12 +96,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "28" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "64" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml index a7a96394c..3b35f1299 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml @@ -96,12 +96,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "512" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "32" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml index 2a27575f2..531f573f3 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml @@ -96,12 +96,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "192"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "40"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml
index 602646d9c..c8a885d95 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml
@@ -96,12 +96,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "48"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "56"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep8-dep16.yaml
index bf5b441b9..27cc59a91 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep8-dep16.yaml
@@ -116,10 +116,16 @@ backend:
   no-disable-hybrid-kv-cache-manager: true
   enable-sleep-mode: true
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "128x256x1024x2048x4096"
-  req_rate: "inf"
-  use_chat_template: false
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "24"
+    USE_CHAT_TEMPLATE: "false"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml
index 63e9e280c..66a2a5219 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml
@@ -108,10 +108,16 @@ backend:
   no-disable-hybrid-kv-cache-manager: true
   enable-sleep-mode: true
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "4096x8192"
-  req_rate: "inf"
-  use_chat_template: false
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "40"
+    USE_CHAT_TEMPLATE: "false"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml
index 0c872e9c4..4eb66b9ba 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml
@@ -148,10 +148,16 @@ backend:
   enable-sleep-mode: true
   tokenizer-mode: deepseek_v4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "1x4x8x16x32x64"
-  req_rate: "inf"
-  use_chat_template: false
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "16"
+    USE_CHAT_TEMPLATE: "false"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml
index d6b750bf2..3e6320fc8 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml
@@ -103,10 +103,16 @@ backend:
   no-disable-hybrid-kv-cache-manager: true
   enable-sleep-mode: true
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "512x1024"
-  req_rate: "inf"
-  use_chat_template: false
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "40"
+    USE_CHAT_TEMPLATE: "false"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-7p1d-dep8-dep16.yaml
index 6213373b3..0f5611403 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-7p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-7p1d-dep8-dep16.yaml
@@ -113,10 +113,16 @@ backend:
   no-disable-hybrid-kv-cache-manager: true
   enable-sleep-mode: true
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "4096x8192"
-  req_rate: "inf"
-  use_chat_template: false
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "72"
+    USE_CHAT_TEMPLATE: "false"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml
index ce3eff436..49a38528d 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml
@@ -106,12 +106,18 @@ backend:
   - cutedsl
   - cuda_core
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "666"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "20"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml
index 105b84bfd..c83b4c67b 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml
@@ -110,12 +110,18 @@ backend:
   - cutedsl
   - cuda_core
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "2253"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "32"
+    TOTAL_GPUS: "36"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml
index 9fb194ddc..e5a833580 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml
@@ -198,12 +198,18 @@ backend:
   - cutedsl
   - cuda_core
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "4301x6452"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "12"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml
index 5639da411..a56150450 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml
@@ -119,12 +119,18 @@ backend:
   - cutedsl
   - cuda_core
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "4x192x360x668"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "36"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml
index f9496feb6..ffb109b8d 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml
@@ -103,12 +103,18 @@ backend:
   - cutedsl
   - cuda_core
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "5x15x30x55"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "4"
+    TOTAL_GPUS: "24"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml
index 71b016c4b..f75876142 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml
@@ -134,12 +134,18 @@ backend:
   - cutedsl
   - cuda_core
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "4301"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "24"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml
index 52b75bb4e..7fdf9daea 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml
@@ -118,12 +118,18 @@ backend:
   - cutedsl
   - cuda_core
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "4301"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "32"
+    TOTAL_GPUS: "40"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml
index 8c1f0aa82..bbc7627ee 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml
@@ -107,12 +107,18 @@ backend:
   - cutedsl
   - cuda_core
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "156"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "4"
+    TOTAL_GPUS: "20"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml
index d4c5086b0..5a0b04c91 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml
@@ -104,12 +104,18 @@ backend:
   - cutedsl
   - cuda_core
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "4"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "36"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml
index 8f6ea063f..90d294ff5 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml
@@ -107,12 +107,18 @@ backend:
   - cutedsl
   - cuda_core
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "5x15x30x60x105"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "4"
+    TOTAL_GPUS: "24"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml
index 4bfaa0e2c..8cc508d5e 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml
@@ -105,12 +105,18 @@ backend:
   - cutedsl
   - cuda_core
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "333"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "24"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml
index d7d51627c..528b0b4f9 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml
@@ -107,12 +107,18 @@ backend:
   - cutedsl
   - cuda_core
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "615"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "28"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml
index e8df1179b..d0dbf80f0 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml
@@ -136,12 +136,18 @@ backend:
   - cutedsl
   - cuda_core
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "2151"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "28"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml
index db1778920..6eb391bba 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml
@@ -119,12 +119,18 @@ backend:
   - cutedsl
   - cuda_core
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "2253"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "44"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep4-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep4-dep16.yaml
index ecdc9233a..c5230d9e5 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep4-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep4-dep16.yaml
@@ -93,9 +93,15 @@ backend:
   stream-interval: 50
   max-cudagraph-capture-size: 512
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "256x512x1024x2048x3072x4096"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "20"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml
index 43167b5f3..0992a5091 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml
@@ -90,9 +90,15 @@ backend:
   stream-interval: 50
   max-cudagraph-capture-size: 1024
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "4x8x16x32x64x128"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "4"
+    TOTAL_GPUS: "20"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml
index 1ab6ca279..5670a9d54 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml
@@ -90,9 +90,15 @@ backend:
   stream-interval: 50
   max-cudagraph-capture-size: 16
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "4x8x16x32x128"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "4"
+    TOTAL_GPUS: "20"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-3p1d-dep4-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-3p1d-dep4-dep16.yaml
index ca4e9813f..cecacdfd7 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-3p1d-dep4-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-3p1d-dep4-dep16.yaml
@@ -93,9 +93,15 @@ backend:
   stream-interval: 50
   max-cudagraph-capture-size: 256
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "512x1024"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "28"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-5p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-5p1d-dep4-dep8.yaml
index cd9f94a9d..259db9436 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-5p1d-dep4-dep8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-5p1d-dep4-dep8.yaml
@@ -93,9 +93,15 @@ backend:
   stream-interval: 50
   max-cudagraph-capture-size: 512
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "2048"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "28"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-6p1d-dep4-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-6p1d-dep4-dep16.yaml
index 47d3d7ee5..0a26d118d 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-6p1d-dep4-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-6p1d-dep4-dep16.yaml
@@ -93,9 +93,15 @@ backend:
   stream-interval: 50
   max-cudagraph-capture-size: 512
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "3072x4096"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "40"