From fab09198e4685b8d1b2b5d885b8f47e101ff2abf Mon Sep 17 00:00:00 2001 From: jthomson04 Date: Mon, 2 Feb 2026 11:48:33 -0800 Subject: [PATCH 1/5] b200 fp8 Signed-off-by: jthomson04 --- .github/configs/nvidia-master.yaml | 435 +++++++++++++++++++++++++++++ 1 file changed, 435 insertions(+) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index e99282490..a35f25c15 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -383,6 +383,441 @@ dsr1-fp4-b200-dynamo-trt: ep: 8 dp-attn: true +dsr1-fp8-b200-dynamo-trt: + image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + runner: b200-multinode-slurm + precision: fp8 + framework: dynamo-trt + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # MTP configurations - Low latency (TP attention) + - spec-decoding: "mtp" + conc-list: [8] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml" + decode: + num-worker: 8 + tp: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [32] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml" + decode: + num-worker: 8 + tp: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [64] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml" + decode: + num-worker: 8 + tp: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [256] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml" + decode: + num-worker: 8 + tp: 8 + dp-attn: false + # MTP configurations - High throughput (DP attention) + - spec-decoding: "mtp" + conc-list: [896] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml" + decode: + num-worker: 7 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [1024] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [1184] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml" + decode: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [1600] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + + # Non-MTP (STP) configurations - Low latency (TP attention) + - conc-list: [4] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml" + decode: + num-worker: 3 + tp: 8 + dp-attn: false + - conc-list: [32] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml" + decode: + num-worker: 3 + tp: 8 + dp-attn: false + - conc-list: [128] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml" + decode: + num-worker: 3 + tp: 8 + dp-attn: false + # Non-MTP (STP) configurations - High throughput (DP attention) + - conc-list: [1920] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [4096] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [5152] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: true + + - isl: 8192 + osl: 1024 + search-space: + # MTP configurations - Low latency (TP attention) + - spec-decoding: "mtp" + conc-list: [8] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml" + decode: + num-worker: 6 + tp: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [8] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml" + decode: + num-worker: 2 + tp: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [48] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml" + decode: + num-worker: 6 + tp: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [64] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml" + decode: + num-worker: 4 + tp: 8 + dp-attn: false + # MTP configurations - High throughput (DP attention) + - spec-decoding: "mtp" + conc-list: [224] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml" + decode: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [288] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [1088] + prefill: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + + # Non-MTP (STP) configurations - Low latency (TP attention) + - conc-list: [1] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml" + decode: + num-worker: 1 + tp: 8 + dp-attn: false + - conc-list: [32] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml" + decode: + num-worker: 4 + tp: 8 + dp-attn: false + - conc-list: [128] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml" + decode: + num-worker: 4 + tp: 8 + dp-attn: false + - conc-list: [96] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml" + decode: + num-worker: 6 + tp: 8 + dp-attn: false + # Non-MTP (STP) configurations - High throughput (DP attention) + - conc-list: [128] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [128] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [256] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [640] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + + dsr1-fp4-b300-dynamo-trt: image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1 model: deepseek-r1-fp4 From d63b5a4fbf2d04d5b9d529408268f84537fee423 Mon Sep 17 00:00:00 2001 From: jthomson04 Date: Thu, 5 Feb 2026 17:04:57 -0800 Subject: [PATCH 2/5] updates Signed-off-by: jthomson04 --- perf-changelog.yaml | 6 ++++++ runners/launch_b200-dgxc-slurm.sh | 6 ++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 20fd381fc..00de1286e 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -400,3 +400,9 @@ - "Update GPT-OSS FP4 B200 TRT pareto configurations and new container image" - "Extend maximum concurrency to 256 across all sequence lengths" pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/639 + +- config-keys: + - dsr1-fp8-b200-dynamo-trt + description: + - "Introduce new DSR1 FP8 B200 Dynamo TRT configurations for 8k1k and 1k1k" + pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/647 \ No newline at end of file diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh index 7a6d31288..b77ca390c 100644 --- a/runners/launch_b200-dgxc-slurm.sh +++ b/runners/launch_b200-dgxc-slurm.sh @@ -34,8 +34,10 @@ export SLURM_ACCOUNT="root" if [[ $MODEL_PREFIX == "dsr1" ]]; then if [[ $PRECISION == "fp4" ]]; then export MODEL_PATH="/lustre/fsw/models/dsr1-0528-nvfp4-v2" + export SRT_SLURM_MODEL_PREFIX="dsr1" elif [[ $PRECISION == "fp8" ]]; then - export MODEL_PATH="/lustre/fsw/models/dsr1-0528-fp8" + export MODEL_PATH="/raid/models/dsr1-0528-fp8" + export SRT_SLURM_MODEL_PREFIX="dsr1-fp8" else echo "Unsupported precision: $PRECISION. Supported precisions are: fp4, fp8" exit 1 @@ -72,7 +74,7 @@ network_interface: "" srtctl_root: "${GITHUB_WORKSPACE}/${SRT_REPO_DIR}" # Model path aliases model_paths: - "${MODEL_PREFIX}": "${MODEL_PATH}" + "${SRT_SLURM_MODEL_PREFIX}": "${MODEL_PATH}" # Container aliases containers: dynamo-trtllm: "${SQUASH_FILE}" From fc9912138b4e096c96cf1fef76c408f2fd93e867 Mon Sep 17 00:00:00 2001 From: jthomson04 Date: Thu, 5 Feb 2026 17:07:53 -0800 Subject: [PATCH 3/5] newline Signed-off-by: jthomson04 --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 00de1286e..13d7b2f25 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -405,4 +405,4 @@ - dsr1-fp8-b200-dynamo-trt description: - "Introduce new DSR1 FP8 B200 Dynamo TRT configurations for 8k1k and 1k1k" - pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/647 \ No newline at end of file + pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/647 From 3ae74afcfa71da11484db7b81112a2ba10b0c652 Mon Sep 17 00:00:00 2001 From: jthomson04 Date: Thu, 5 Feb 2026 17:13:09 -0800 Subject: [PATCH 4/5] Add ep config field Signed-off-by: jthomson04 --- .github/configs/nvidia-master.yaml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index ce8ecae79..63925846f 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -410,6 +410,7 @@ dsr1-fp8-b200-dynamo-trt: decode: num-worker: 8 tp: 8 + ep: 1 dp-attn: false - spec-decoding: "mtp" conc-list: [32] @@ -424,6 +425,7 @@ dsr1-fp8-b200-dynamo-trt: decode: num-worker: 8 tp: 8 + ep: 1 dp-attn: false - spec-decoding: "mtp" conc-list: [64] @@ -438,6 +440,7 @@ dsr1-fp8-b200-dynamo-trt: decode: num-worker: 8 tp: 8 + ep: 1 dp-attn: false - spec-decoding: "mtp" conc-list: [256] @@ -452,6 +455,7 @@ dsr1-fp8-b200-dynamo-trt: decode: num-worker: 8 tp: 8 + ep: 1 dp-attn: false # MTP configurations - High throughput (DP attention) - spec-decoding: "mtp" @@ -528,6 +532,7 @@ dsr1-fp8-b200-dynamo-trt: decode: num-worker: 3 tp: 8 + ep: 1 dp-attn: false - conc-list: [32] prefill: @@ -541,6 +546,7 @@ dsr1-fp8-b200-dynamo-trt: decode: num-worker: 3 tp: 8 + ep: 1 dp-attn: false - conc-list: [128] prefill: @@ -554,6 +560,7 @@ dsr1-fp8-b200-dynamo-trt: decode: num-worker: 3 tp: 8 + ep: 1 dp-attn: false # Non-MTP (STP) configurations - High throughput (DP attention) - conc-list: [1920] @@ -616,6 +623,7 @@ dsr1-fp8-b200-dynamo-trt: decode: num-worker: 6 tp: 8 + ep: 1 dp-attn: false - spec-decoding: "mtp" conc-list: [8] @@ -630,6 +638,7 @@ dsr1-fp8-b200-dynamo-trt: decode: num-worker: 2 tp: 8 + ep: 1 dp-attn: false - spec-decoding: "mtp" conc-list: [48] @@ -644,6 +653,7 @@ dsr1-fp8-b200-dynamo-trt: decode: num-worker: 6 tp: 8 + ep: 1 dp-attn: false - spec-decoding: "mtp" conc-list: [64] @@ -658,6 +668,7 @@ dsr1-fp8-b200-dynamo-trt: decode: num-worker: 4 tp: 8 + ep: 1 dp-attn: false # MTP configurations - High throughput (DP attention) - spec-decoding: "mtp" @@ -719,6 +730,7 @@ dsr1-fp8-b200-dynamo-trt: decode: num-worker: 1 tp: 8 + ep: 1 dp-attn: false - conc-list: [32] prefill: @@ -732,6 +744,7 @@ dsr1-fp8-b200-dynamo-trt: decode: num-worker: 4 tp: 8 + ep: 1 dp-attn: false - conc-list: [128] prefill: @@ -745,6 +758,7 @@ dsr1-fp8-b200-dynamo-trt: decode: num-worker: 4 tp: 8 + ep: 1 dp-attn: false - conc-list: [96] prefill: @@ -758,6 +772,7 @@ dsr1-fp8-b200-dynamo-trt: decode: num-worker: 6 tp: 8 + ep: 1 dp-attn: false # Non-MTP (STP) configurations - High throughput (DP attention) - conc-list: [128] From 8afce6b8ac4df8608cf1eb41f86cc91007df5a82 Mon Sep 17 00:00:00 2001 From: jthomson04 Date: Tue, 10 Feb 2026 08:52:42 -0800 Subject: [PATCH 5/5] fix perf changelog Signed-off-by: jthomson04 --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 129bf5a88..99de5649d 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -496,4 +496,4 @@ - dsr1-fp8-b200-dynamo-trt description: - "Introduce new DSR1 FP8 B200 Dynamo TRT configurations for 8k1k and 1k1k" - pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/647 + pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/616