From 6ce87790fbac9e44ec41ab990f2b8cffe816e063 Mon Sep 17 00:00:00 2001
From: shicli <shicli@nvidia.com>
Date: Wed, 7 Jan 2026 10:42:37 +0800
Subject: [PATCH 01/19] Add refactored MTP benchmarks for dsr1 TRT

- Add dsr1_fp4_b200_trt_mtp_slurm.sh with MTP support
- Add dsr1_fp8_b200_trt_mtp_slurm.sh with MTP support
- Add dsr1_fp8_h200_trt_mtp_slurm.sh with MTP support
- Refactored to use benchmark_lib.sh utilities
- Use wait_for_server_ready and run_benchmark_serving functions
---
 benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh | 154 ++++++++++++++++++++++
 benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh | 115 ++++++++++++++++
 benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh | 110 ++++++++++++++++
 3 files changed, 379 insertions(+)
 create mode 100644 benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh
 create mode 100644 benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh
 create mode 100644 benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh

diff --git a/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh
new file mode 100644
index 000000000..8200e27a7
--- /dev/null
+++ b/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh
@@ -0,0 +1,154 @@
+#!/usr/bin/env bash
+
+# === Required Env Vars ===
+# MODEL
+# TP
+# CONC
+# ISL
+# OSL
+# MAX_MODEL_LEN
+# RANDOM_RANGE_RATIO
+# RESULT_FILENAME
+# PORT_OFFSET
+
+echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+
+echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"
+
+hf download $MODEL
+
+# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC, TP =========
+EP_SIZE="1"
+MOE_BACKEND="TRTLLM"
+DP_ATTENTION=false
+MTP=3
+
+if [[ "$TP" == "4" ]]; then
+    if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
+        if [[ $CONC -ge 16 ]]; then
+            EP_SIZE="$TP"
+        fi
+        if [[ $CONC -ge 128 ]]; then
+            DP_ATTENTION=true
+            MOE_BACKEND="CUTLASS"
+            MTP=1
+        fi
+    elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then
+        if [[ $CONC -ge 32 ]]; then
+            EP_SIZE="$TP"
+        fi
+        if [[ $CONC -ge 128 ]]; then
+            DP_ATTENTION=true
+            MOE_BACKEND="CUTLASS"
+            MTP=1
+        fi
+    elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then
+        if [[ $CONC -ge 32 ]]; then
+            EP_SIZE="$TP"
+            DP_ATTENTION=true
+            MOE_BACKEND="CUTLASS"
+            MTP=1
+        fi
+    fi
+elif [[ "$TP" == "8" ]]; then
+    if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
+        if [[ $CONC -ge 16 ]]; then
+            EP_SIZE="$TP"
+        fi
+        if [[ $CONC -ge 64 ]]; then
+            DP_ATTENTION=true
+            MOE_BACKEND="CUTLASS"
+            MTP=1
+        fi
+    elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then
+        if [[ $CONC -ge 8 ]]; then
+            EP_SIZE="$TP"
+        fi
+        if [[ $CONC -ge 128 ]]; then
+            DP_ATTENTION=true
+            MOE_BACKEND="CUTLASS"
+            MTP=1
+        fi
+    elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then
+        if [[ $CONC -ge 32 ]]; then
+            EP_SIZE="$TP"
+            DP_ATTENTION=true
+            MOE_BACKEND="CUTLASS"
+            MTP=1
+        fi
+    fi
+fi
+
+echo "EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION', MTP='$MTP'"
+
+SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
+PORT=$(( 8888 + $PORT_OFFSET ))
+EXTRA_CONFIG_FILE="dsr1-fp4-mtp.yml"
+
+cat > $EXTRA_CONFIG_FILE << EOF
+cuda_graph_config:
+    enable_padding: true
+    max_batch_size: 512
+enable_attention_dp: $DP_ATTENTION
+print_iter_log: true
+kv_cache_config:
+    dtype: fp8
+    free_gpu_memory_fraction: 0.8
+    enable_block_reuse: false 
+stream_interval: 10
+moe_config:
+    backend: $MOE_BACKEND
+speculative_config:
+    decoding_type: MTP
+    num_nextn_predict_layers: ${MTP}
+EOF
+
+if [[ "$DP_ATTENTION" == "true" ]]; then
+    cat << EOF >> $EXTRA_CONFIG_FILE
+attention_dp_config:
+    batching_wait_iters: 0
+    enable_balance: true
+    timeout_iters: 60
+EOF
+fi
+
+if [[ "$DP_ATTENTION" == "true" ]]; then
+    MAX_BATCH_SIZE=$((CONC/TP))
+else
+    MAX_BATCH_SIZE=$CONC
+fi
+
+MAX_NUM_TOKENS=$(( ((MTP+1)*MAX_BATCH_SIZE+ISL+64+63)/64*64 ))
+
+set -x
+# Launch TRT-LLM server
+mpirun -n 1 --oversubscribe --allow-run-as-root \
+    trtllm-serve $MODEL --port=$PORT \
+    --trust_remote_code \
+    --backend=pytorch \
+    --max_batch_size=$MAX_BATCH_SIZE \
+    --max_seq_len=$MAX_MODEL_LEN \
+    --max_num_tokens=$MAX_NUM_TOKENS \
+    --tp_size=$TP --ep_size=$EP_SIZE \
+    --extra_llm_api_options=$EXTRA_CONFIG_FILE \
+    > $SERVER_LOG 2>&1 &
+
+SERVER_PID=$!
+
+# Source benchmark utilities
+source "$(dirname "$0")/benchmark_lib.sh"
+
+# Wait for server to be ready
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend openai \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts $(( $CONC * 10 )) \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir /workspace/
diff --git a/benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh
new file mode 100644
index 000000000..8a6795edc
--- /dev/null
+++ b/benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh
@@ -0,0 +1,115 @@
+#!/usr/bin/env bash
+
+# === Required Env Vars ===
+# MODEL
+# TP
+# CONC
+# ISL
+# OSL
+# MAX_MODEL_LEN
+# RANDOM_RANGE_RATIO
+# RESULT_FILENAME
+# PORT_OFFSET
+
+echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+
+echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"
+
+hf download $MODEL
+
+# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC =========
+EP_SIZE="$TP"
+MOE_BACKEND="DEEPGEMM"
+DP_ATTENTION=false
+MTP=3
+
+if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
+    if [[ $CONC -ge 64 ]]; then
+        DP_ATTENTION=true
+        MTP=1
+    fi
+elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then
+    if [[ $CONC -ge 128 ]]; then
+        DP_ATTENTION=true
+        MTP=1
+    fi
+elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then
+    if [[ $CONC -ge 64 ]]; then
+        DP_ATTENTION=true
+        MTP=1
+    fi
+fi
+
+echo "EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION', MTP='$MTP'"
+
+SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
+PORT=$(( 8888 + $PORT_OFFSET ))
+EXTRA_CONFIG_FILE="dsr1-fp8-mtp.yml"
+
+cat > $EXTRA_CONFIG_FILE << EOF
+cuda_graph_config:
+    enable_padding: true
+    max_batch_size: 256
+enable_attention_dp: $DP_ATTENTION
+print_iter_log: true
+kv_cache_config:
+    dtype: fp8
+    free_gpu_memory_fraction: 0.8
+    enable_block_reuse: false 
+stream_interval: 10
+moe_config:
+    backend: $MOE_BACKEND
+speculative_config:
+    decoding_type: MTP
+    num_nextn_predict_layers: ${MTP}
+EOF
+
+if [[ "$DP_ATTENTION" == "true" ]]; then
+    cat << EOF >> $EXTRA_CONFIG_FILE
+attention_dp_config:
+    batching_wait_iters: 0
+    enable_balance: true
+    timeout_iters: 60
+EOF
+fi
+
+if [[ "$DP_ATTENTION" == "true" ]]; then
+    MAX_BATCH_SIZE=$((CONC/TP))
+else
+    MAX_BATCH_SIZE=$CONC
+fi
+
+MAX_NUM_TOKENS=$(( ((MTP+1)*MAX_BATCH_SIZE+ISL+64+63)/64*64 ))
+
+set -x
+# Launch TRT-LLM server
+mpirun -n 1 --oversubscribe --allow-run-as-root \
+    trtllm-serve $MODEL --port=$PORT \
+    --trust_remote_code \
+    --backend=pytorch \
+    --max_batch_size=$MAX_BATCH_SIZE \
+    --max_seq_len=$MAX_MODEL_LEN \
+    --max_num_tokens=$MAX_NUM_TOKENS \
+    --tp_size=$TP --ep_size=$EP_SIZE \
+    --extra_llm_api_options=$EXTRA_CONFIG_FILE \
+    > $SERVER_LOG 2>&1 &
+
+SERVER_PID=$!
+
+# Source benchmark utilities
+source "$(dirname "$0")/benchmark_lib.sh"
+
+# Wait for server to be ready
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend openai \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts $(( $CONC * 10 )) \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir /workspace/
diff --git a/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh
new file mode 100644
index 000000000..065ab7603
--- /dev/null
+++ b/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh
@@ -0,0 +1,110 @@
+#!/usr/bin/env bash
+
+# === Required Env Vars ===
+# MODEL
+# TP
+# CONC
+# ISL
+# OSL
+# MAX_MODEL_LEN
+# RANDOM_RANGE_RATIO
+# RESULT_FILENAME
+# PORT_OFFSET
+
+echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+
+echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"
+
+hf download $MODEL
+
+# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC =========
+EP_SIZE="$TP"
+MOE_BACKEND="CUTLASS"
+DP_ATTENTION=false
+MTP=3
+
+if [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then
+    if [[ $CONC -ge 256 ]]; then
+        DP_ATTENTION=true
+        MTP=1
+    fi
+elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then
+    if [[ $CONC -ge 64 ]]; then
+        DP_ATTENTION=true
+        MTP=1
+    fi
+fi
+
+echo "EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION', MTP='$MTP'"
+
+SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
+PORT=$(( 8888 + $PORT_OFFSET ))
+EXTRA_CONFIG_FILE="dsr1-fp8-mtp.yml"
+
+cat > $EXTRA_CONFIG_FILE << EOF
+cuda_graph_config:
+    enable_padding: true
+    max_batch_size: 128
+enable_attention_dp: $DP_ATTENTION
+print_iter_log: true
+kv_cache_config:
+    dtype: fp8
+    free_gpu_memory_fraction: 0.75
+    enable_block_reuse: false 
+stream_interval: 10
+moe_config:
+    backend: $MOE_BACKEND
+speculative_config:
+    decoding_type: MTP
+    num_nextn_predict_layers: ${MTP}
+EOF
+
+if [[ "$DP_ATTENTION" == "true" ]]; then
+    cat << EOF >> $EXTRA_CONFIG_FILE
+attention_dp_config:
+    batching_wait_iters: 0
+    enable_balance: true
+    timeout_iters: 60
+EOF
+fi
+
+if [[ "$DP_ATTENTION" == "true" ]]; then
+    MAX_BATCH_SIZE=$((CONC/TP))
+else
+    MAX_BATCH_SIZE=$CONC
+fi
+
+MAX_NUM_TOKENS=$(( ((MTP+1)*MAX_BATCH_SIZE+ISL+64+63)/64*64 ))
+
+set -x
+# Launch TRT-LLM server
+PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \
+    trtllm-serve $MODEL --port=$PORT \
+    --trust_remote_code \
+    --backend=pytorch \
+    --max_batch_size=$MAX_BATCH_SIZE \
+    --max_seq_len=$MAX_MODEL_LEN \
+    --max_num_tokens=$MAX_NUM_TOKENS \
+    --tp_size=$TP --ep_size=$EP_SIZE \
+    --extra_llm_api_options=$EXTRA_CONFIG_FILE \
+    > $SERVER_LOG 2>&1 &
+
+SERVER_PID=$!
+
+# Source benchmark utilities
+source "$(dirname "$0")/benchmark_lib.sh"
+
+# Wait for server to be ready
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend openai \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts $(( $CONC * 10 )) \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir /workspace/

From 92eca1a79e34e11afc34e01bd898ba69d3e5e3a0 Mon Sep 17 00:00:00 2001
From: shicli <shicli@nvidia.com>
Date: Wed, 7 Jan 2026 10:50:07 +0800
Subject: [PATCH 02/19] Add --use-chat-template support for MTP benchmarks

- Extended benchmark_lib.sh run_benchmark_serving() to support optional --use-chat-template flag
- Added --use-chat-template to all three MTP benchmark scripts
- This is required for MTP mode to work correctly
---
 benchmarks/benchmark_lib.sh               | 50 +++++++++++++++--------
 benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh |  3 +-
 benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh |  3 +-
 benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh |  3 +-
 4 files changed, 39 insertions(+), 20 deletions(-)

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index cc7b81553..fad1c5064 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -92,7 +92,7 @@ wait_for_server_ready() {
 }
 
 # Run benchmark serving with standardized parameters
-# All parameters are required
+# All parameters are required except --use-chat-template
 # Parameters:
 #   --model: Model name
 #   --port: Server port
@@ -104,6 +104,7 @@ wait_for_server_ready() {
 #   --max-concurrency: Max concurrency
 #   --result-filename: Result filename without extension
 #   --result-dir: Result directory
+#   --use-chat-template: Optional flag to enable chat template
 run_benchmark_serving() {
     set +x
     local model=""
@@ -116,6 +117,7 @@ run_benchmark_serving() {
     local max_concurrency=""
     local result_filename=""
     local result_dir=""
+    local use_chat_template=false
 
     # Parse arguments
     while [[ $# -gt 0 ]]; do
@@ -160,6 +162,10 @@ run_benchmark_serving() {
                 result_dir="$2"
                 shift 2
                 ;;
+            --use-chat-template)
+                use_chat_template=true
+                shift
+                ;;
             *)
                 echo "Unknown parameter: $1"
                 return 1
@@ -224,23 +230,33 @@ run_benchmark_serving() {
     local BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX)
     git clone https://github.com/kimbochen/bench_serving.git "$BENCH_SERVING_DIR"
 
+    # Build benchmark command
+    local benchmark_cmd=(
+        python3 "$BENCH_SERVING_DIR/benchmark_serving.py"
+        --model "$model"
+        --backend "$backend"
+        --base-url "http://0.0.0.0:$port"
+        --dataset-name random
+        --random-input-len "$input_len"
+        --random-output-len "$output_len"
+        --random-range-ratio "$random_range_ratio"
+        --num-prompts "$num_prompts"
+        --max-concurrency "$max_concurrency"
+        --request-rate inf
+        --ignore-eos
+        --save-result
+        --percentile-metrics 'ttft,tpot,itl,e2el'
+        --result-dir "$result_dir"
+        --result-filename "$result_filename.json"
+    )
+    
+    # Add --use-chat-template if requested
+    if [[ "$use_chat_template" == true ]]; then
+        benchmark_cmd+=(--use-chat-template)
+    fi
+
     # Run benchmark
     set -x
-    python3 "$BENCH_SERVING_DIR/benchmark_serving.py" \
-        --model "$model" \
-        --backend "$backend" \
-        --base-url "http://0.0.0.0:$port" \
-        --dataset-name random \
-        --random-input-len "$input_len" \
-        --random-output-len "$output_len" \
-        --random-range-ratio "$random_range_ratio" \
-        --num-prompts "$num_prompts" \
-        --max-concurrency "$max_concurrency" \
-        --request-rate inf \
-        --ignore-eos \
-        --save-result \
-        --percentile-metrics 'ttft,tpot,itl,e2el' \
-        --result-dir "$result_dir" \
-        --result-filename "$result_filename.json"
+    "${benchmark_cmd[@]}"
     set +x
 }
diff --git a/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh
index 8200e27a7..15160202b 100644
--- a/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh
+++ b/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh
@@ -151,4 +151,5 @@ run_benchmark_serving \
     --num-prompts $(( $CONC * 10 )) \
     --max-concurrency "$CONC" \
     --result-filename "$RESULT_FILENAME" \
-    --result-dir /workspace/
+    --result-dir /workspace/ \
+    --use-chat-template
diff --git a/benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh
index 8a6795edc..41224b696 100644
--- a/benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh
+++ b/benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh
@@ -112,4 +112,5 @@ run_benchmark_serving \
     --num-prompts $(( $CONC * 10 )) \
     --max-concurrency "$CONC" \
     --result-filename "$RESULT_FILENAME" \
-    --result-dir /workspace/
+    --result-dir /workspace/ \
+    --use-chat-template
diff --git a/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh
index 065ab7603..99d2e3d20 100644
--- a/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh
+++ b/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh
@@ -107,4 +107,5 @@ run_benchmark_serving \
     --num-prompts $(( $CONC * 10 )) \
     --max-concurrency "$CONC" \
     --result-filename "$RESULT_FILENAME" \
-    --result-dir /workspace/
+    --result-dir /workspace/ \
+    --use-chat-template

From fddf14edf99b544d83680307bd6c9b99f0be512c Mon Sep 17 00:00:00 2001
From: shicli <shicli@nvidia.com>
Date: Wed, 7 Jan 2026 11:10:45 +0800
Subject: [PATCH 03/19] Add MTP benchmark configurations to nvidia-master.yaml

- Add dsr1-fp4-b200-trt-mtp configuration with EP/DP_ATTN/MTP logic
- Add dsr1-fp8-b200-trt-mtp configuration with EP/DP_ATTN/MTP logic
- Add dsr1-fp8-h200-trt-mtp configuration with EP/DP_ATTN/MTP logic
- Configurations align with benchmark script logic for dynamic EP_SIZE, MOE_BACKEND, and MTP values
---
 .github/configs/nvidia-master.yaml | 108 +++++++++++++++++++++++++++++
 1 file changed, 108 insertions(+)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 34d85fcca..4b72cc498 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -74,6 +74,57 @@ dsr1-fp4-b200-trt:
     - { tp: 8, conc-start: 4, conc-end: 32 }
     - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 }
 
+dsr1-fp4-b200-trt-mtp:
+  image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2
+  model: nvidia/DeepSeek-R1-0528-FP4-V2
+  model-prefix: dsr1
+  runner: b200-trt-mtp
+  precision: fp4
+  framework: trt
+  multinode: false
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    # If TP=4:
+    #   If CONC >= 16, then EP=4
+    #   If CONC >= 128, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
+    - { tp: 4, conc-start: 4, conc-end: 8 }
+    - { tp: 4, ep: 4, conc-start: 16, conc-end: 64 }
+    - { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 256 }
+    # If TP=8:
+    #   If CONC >= 16, then EP=8
+    #   If CONC >= 64, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
+    - { tp: 8, conc-start: 4, conc-end: 8 }
+    - { tp: 8, ep: 8, conc-start: 16, conc-end: 32 }
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 }
+  - isl: 1024
+    osl: 8192
+    search-space:
+    # If TP=4:
+    #   If CONC >= 32, then EP=4
+    #   If CONC >= 128, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
+    - { tp: 4, conc-start: 4, conc-end: 16 }
+    - { tp: 4, ep: 4, conc-start: 32, conc-end: 64 }
+    - { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 256 }
+    # If TP=8:
+    #   If CONC >= 8, then EP=8
+    #   If CONC >= 128, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
+    - { tp: 8, conc-start: 4, conc-end: 4 }
+    - { tp: 8, ep: 8, conc-start: 8, conc-end: 64 }
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 }
+  - isl: 8192
+    osl: 1024
+    search-space:
+    # If TP=4:
+    #   If CONC >= 32, then EP=4, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
+    - { tp: 4, conc-start: 4, conc-end: 16 }
+    - { tp: 4, ep: 4, dp-attn: true, conc-start: 32, conc-end: 256 }
+    # If TP=8:
+    #   If CONC >= 32, then EP=8, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
+    - { tp: 8, conc-start: 4, conc-end: 16 }
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256 }
+
 dsr1-fp8-b200-sglang:
   image: lmsysorg/sglang:v0.5.6-cu129-amd64
   model: deepseek-ai/DeepSeek-R1-0528
@@ -123,6 +174,35 @@ dsr1-fp8-b200-trt:
     # If CONC > 64, then DP_ATTN=true
     - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
 
+dsr1-fp8-b200-trt-mtp:
+  image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  runner: b200-trt-mtp
+  precision: fp8
+  framework: trt
+  multinode: false
+  seq-len-configs:
+  # For all sequence lengths, EP=TP, MOE_BACKEND=DEEPGEMM, MTP=3 (or MTP=1 when DP_ATTN=true)
+  - isl: 1024
+    osl: 1024
+    search-space:
+    # If CONC >= 64, then DP_ATTN=true, MTP=1
+    - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 }
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 }
+  - isl: 1024
+    osl: 8192
+    search-space:
+    # If CONC >= 128, then DP_ATTN=true, MTP=1
+    - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 }
+  - isl: 8192
+    osl: 1024
+    search-space:
+    # If CONC >= 64, then DP_ATTN=true, MTP=1
+    - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 }
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 }
+
 dsr1-fp8-h200-sglang:
   image: lmsysorg/sglang:v0.5.6-cu129-amd64
   model: deepseek-ai/DeepSeek-R1-0528
@@ -172,6 +252,34 @@ dsr1-fp8-h200-trt:
     - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 }
     - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64 }
 
+dsr1-fp8-h200-trt-mtp:
+  image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  runner: h200-trt-mtp
+  precision: fp8
+  framework: trt
+  multinode: false
+  # For all sequence lengths, EP=TP, MOE_BACKEND=CUTLASS, MTP=3 (or MTP=1 when DP_ATTN=true)
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    # MTP=3 for all, no DP_ATTN in this sequence length
+    - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
+  - isl: 1024
+    osl: 8192
+    search-space:
+    # If CONC >= 256, then DP_ATTN=true, MTP=1
+    - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 }
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256 }
+  - isl: 8192
+    osl: 1024
+    search-space:
+    # If CONC >= 64, then DP_ATTN=true, MTP=1
+    - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 }
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 }
+
 gptoss-fp4-b200-trt:
   image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc2
   model: openai/gpt-oss-120b

From 59b38b984859504c819c0c262084343db1713c60 Mon Sep 17 00:00:00 2001
From: shicli <shicli@nvidia.com>
Date: Wed, 7 Jan 2026 11:30:27 +0800
Subject: [PATCH 04/19] Refactor MTP benchmarks to receive EP_SIZE and
 DP_ATTENTION from env vars

- Remove duplicate EP_SIZE/DP_ATTENTION calculation logic from MTP scripts
- MTP scripts now receive EP_SIZE and DP_ATTENTION as env vars from YAML config (like non-MTP scripts)
- Only calculate MOE_BACKEND and MTP values based on DP_ATTENTION flag
- Simplifies scripts from 155/116/111 lines to 104 lines each
- Eliminates redundant logic between YAML configs and bash scripts
---
 benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh | 73 ++++-------------------
 benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh | 32 ++++------
 benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh | 27 ++++-----
 3 files changed, 31 insertions(+), 101 deletions(-)

diff --git a/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh
index 15160202b..33d819efa 100644
--- a/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh
+++ b/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh
@@ -10,76 +10,25 @@
 # RANDOM_RANGE_RATIO
 # RESULT_FILENAME
 # PORT_OFFSET
+# DP_ATTENTION
+# EP_SIZE
 
 echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 
-echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"
+echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"
 
 hf download $MODEL
 
-# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC, TP =========
-EP_SIZE="1"
-MOE_BACKEND="TRTLLM"
-DP_ATTENTION=false
-MTP=3
-
-if [[ "$TP" == "4" ]]; then
-    if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
-        if [[ $CONC -ge 16 ]]; then
-            EP_SIZE="$TP"
-        fi
-        if [[ $CONC -ge 128 ]]; then
-            DP_ATTENTION=true
-            MOE_BACKEND="CUTLASS"
-            MTP=1
-        fi
-    elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then
-        if [[ $CONC -ge 32 ]]; then
-            EP_SIZE="$TP"
-        fi
-        if [[ $CONC -ge 128 ]]; then
-            DP_ATTENTION=true
-            MOE_BACKEND="CUTLASS"
-            MTP=1
-        fi
-    elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then
-        if [[ $CONC -ge 32 ]]; then
-            EP_SIZE="$TP"
-            DP_ATTENTION=true
-            MOE_BACKEND="CUTLASS"
-            MTP=1
-        fi
-    fi
-elif [[ "$TP" == "8" ]]; then
-    if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
-        if [[ $CONC -ge 16 ]]; then
-            EP_SIZE="$TP"
-        fi
-        if [[ $CONC -ge 64 ]]; then
-            DP_ATTENTION=true
-            MOE_BACKEND="CUTLASS"
-            MTP=1
-        fi
-    elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then
-        if [[ $CONC -ge 8 ]]; then
-            EP_SIZE="$TP"
-        fi
-        if [[ $CONC -ge 128 ]]; then
-            DP_ATTENTION=true
-            MOE_BACKEND="CUTLASS"
-            MTP=1
-        fi
-    elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then
-        if [[ $CONC -ge 32 ]]; then
-            EP_SIZE="$TP"
-            DP_ATTENTION=true
-            MOE_BACKEND="CUTLASS"
-            MTP=1
-        fi
-    fi
+# ========= Determine MOE_BACKEND and MTP based on DP_ATTENTION =========
+if [[ "$DP_ATTENTION" == "true" ]]; then
+    MOE_BACKEND="CUTLASS"
+    MTP=1
+else
+    MOE_BACKEND="TRTLLM"
+    MTP=3
 fi
 
-echo "EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION', MTP='$MTP'"
+echo "MOE_BACKEND='$MOE_BACKEND', MTP='$MTP'"
 
 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
 PORT=$(( 8888 + $PORT_OFFSET ))
diff --git a/benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh
index 41224b696..9054bd014 100644
--- a/benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh
+++ b/benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh
@@ -10,37 +10,25 @@
 # RANDOM_RANGE_RATIO
 # RESULT_FILENAME
 # PORT_OFFSET
+# DP_ATTENTION
+# EP_SIZE
 
 echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 
-echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"
+echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"
 
 hf download $MODEL
 
-# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC =========
-EP_SIZE="$TP"
+# ========= Determine MOE_BACKEND and MTP based on DP_ATTENTION =========
 MOE_BACKEND="DEEPGEMM"
-DP_ATTENTION=false
-MTP=3
-
-if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
-    if [[ $CONC -ge 64 ]]; then
-        DP_ATTENTION=true
-        MTP=1
-    fi
-elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then
-    if [[ $CONC -ge 128 ]]; then
-        DP_ATTENTION=true
-        MTP=1
-    fi
-elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then
-    if [[ $CONC -ge 64 ]]; then
-        DP_ATTENTION=true
-        MTP=1
-    fi
+
+if [[ "$DP_ATTENTION" == "true" ]]; then
+    MTP=1
+else
+    MTP=3
 fi
 
-echo "EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION', MTP='$MTP'"
+echo "MOE_BACKEND='$MOE_BACKEND', MTP='$MTP'"
 
 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
 PORT=$(( 8888 + $PORT_OFFSET ))
diff --git a/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh
index 99d2e3d20..b7063f395 100644
--- a/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh
+++ b/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh
@@ -10,32 +10,25 @@
 # RANDOM_RANGE_RATIO
 # RESULT_FILENAME
 # PORT_OFFSET
+# DP_ATTENTION
+# EP_SIZE
 
 echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 
-echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"
+echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"
 
 hf download $MODEL
 
-# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC =========
-EP_SIZE="$TP"
+# ========= Determine MOE_BACKEND and MTP based on DP_ATTENTION =========
 MOE_BACKEND="CUTLASS"
-DP_ATTENTION=false
-MTP=3
-
-if [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then
-    if [[ $CONC -ge 256 ]]; then
-        DP_ATTENTION=true
-        MTP=1
-    fi
-elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then
-    if [[ $CONC -ge 64 ]]; then
-        DP_ATTENTION=true
-        MTP=1
-    fi
+
+if [[ "$DP_ATTENTION" == "true" ]]; then
+    MTP=1
+else
+    MTP=3
 fi
 
-echo "EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION', MTP='$MTP'"
+echo "MOE_BACKEND='$MOE_BACKEND', MTP='$MTP'"
 
 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
 PORT=$(( 8888 + $PORT_OFFSET ))

From f385b80e4ec7e807c0a9486147d346d22c17c6a8 Mon Sep 17 00:00:00 2001
From: shicli <shicli@nvidia.com>
Date: Wed, 7 Jan 2026 11:35:47 +0800
Subject: [PATCH 05/19] Fix MTP benchmark configurations to match original
 script logic

- Fix dsr1-fp4-b200-trt-mtp conc ranges to match EP_SIZE conditions
- Fix dsr1-fp8-b200-trt-mtp conc ranges to match DP_ATTENTION conditions
- Fix dsr1-fp8-h200-trt-mtp conc ranges to match DP_ATTENTION conditions
- All configurations now accurately reflect the original bash script conditional logic
---
 .github/configs/nvidia-master.yaml | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 4b72cc498..1c38e6827 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -89,14 +89,14 @@ dsr1-fp4-b200-trt-mtp:
     # If TP=4:
     #   If CONC >= 16, then EP=4
     #   If CONC >= 128, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
-    - { tp: 4, conc-start: 4, conc-end: 8 }
-    - { tp: 4, ep: 4, conc-start: 16, conc-end: 64 }
+    - { tp: 4, conc-start: 4, conc-end: 16 }
+    - { tp: 4, ep: 4, conc-start: 16, conc-end: 128 }
     - { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 256 }
     # If TP=8:
     #   If CONC >= 16, then EP=8
     #   If CONC >= 64, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
-    - { tp: 8, conc-start: 4, conc-end: 8 }
-    - { tp: 8, ep: 8, conc-start: 16, conc-end: 32 }
+    - { tp: 8, conc-start: 4, conc-end: 16 }
+    - { tp: 8, ep: 8, conc-start: 16, conc-end: 64 }
     - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 }
   - isl: 1024
     osl: 8192
@@ -104,25 +104,25 @@ dsr1-fp4-b200-trt-mtp:
     # If TP=4:
     #   If CONC >= 32, then EP=4
     #   If CONC >= 128, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
-    - { tp: 4, conc-start: 4, conc-end: 16 }
-    - { tp: 4, ep: 4, conc-start: 32, conc-end: 64 }
+    - { tp: 4, conc-start: 4, conc-end: 32 }
+    - { tp: 4, ep: 4, conc-start: 32, conc-end: 128 }
     - { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 256 }
     # If TP=8:
     #   If CONC >= 8, then EP=8
     #   If CONC >= 128, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
-    - { tp: 8, conc-start: 4, conc-end: 4 }
-    - { tp: 8, ep: 8, conc-start: 8, conc-end: 64 }
+    - { tp: 8, conc-start: 4, conc-end: 8 }
+    - { tp: 8, ep: 8, conc-start: 8, conc-end: 128 }
     - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 }
   - isl: 8192
     osl: 1024
     search-space:
     # If TP=4:
     #   If CONC >= 32, then EP=4, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
-    - { tp: 4, conc-start: 4, conc-end: 16 }
+    - { tp: 4, conc-start: 4, conc-end: 32 }
     - { tp: 4, ep: 4, dp-attn: true, conc-start: 32, conc-end: 256 }
     # If TP=8:
     #   If CONC >= 32, then EP=8, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
-    - { tp: 8, conc-start: 4, conc-end: 16 }
+    - { tp: 8, conc-start: 4, conc-end: 32 }
     - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256 }
 
 dsr1-fp8-b200-sglang:
@@ -188,19 +188,19 @@ dsr1-fp8-b200-trt-mtp:
     osl: 1024
     search-space:
     # If CONC >= 64, then DP_ATTN=true, MTP=1
-    - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 }
+    - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
     - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 }
   - isl: 1024
     osl: 8192
     search-space:
     # If CONC >= 128, then DP_ATTN=true, MTP=1
-    - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
+    - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 }
     - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 }
   - isl: 8192
     osl: 1024
     search-space:
     # If CONC >= 64, then DP_ATTN=true, MTP=1
-    - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 }
+    - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
     - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 }
 
 dsr1-fp8-h200-sglang:
@@ -271,13 +271,13 @@ dsr1-fp8-h200-trt-mtp:
     osl: 8192
     search-space:
     # If CONC >= 256, then DP_ATTN=true, MTP=1
-    - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 }
+    - { tp: 8, ep: 8, conc-start: 4, conc-end: 256 }
     - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256 }
   - isl: 8192
     osl: 1024
     search-space:
     # If CONC >= 64, then DP_ATTN=true, MTP=1
-    - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 }
+    - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
     - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 }
 
 gptoss-fp4-b200-trt:

From c5f4550c13f60238b1f72d42a0d93450d966910c Mon Sep 17 00:00:00 2001
From: shicli <shicli@nvidia.com>
Date: Wed, 7 Jan 2026 11:40:54 +0800
Subject: [PATCH 06/19] Align MTP conc ranges to powers of 2

- Change conc-end from 64 to 32 for dsr1-fp8-b200-trt-mtp ISL=1024/OSL=1024
- Change conc-end from 128 to 64 for dsr1-fp8-b200-trt-mtp ISL=1024/OSL=8192
- Change conc-end from 64 to 32 for dsr1-fp8-b200-trt-mtp ISL=8192/OSL=1024
- Change conc-end from 256 to 128 for dsr1-fp8-h200-trt-mtp ISL=1024/OSL=8192
- Change conc-end from 64 to 32 for dsr1-fp8-h200-trt-mtp ISL=8192/OSL=1024
- All concurrency ranges now align to powers of 2: 4, 8, 16, 32, 64, 128, 256
---
 .github/configs/nvidia-master.yaml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 1c38e6827..87b6eaf50 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -188,19 +188,19 @@ dsr1-fp8-b200-trt-mtp:
     osl: 1024
     search-space:
     # If CONC >= 64, then DP_ATTN=true, MTP=1
-    - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
+    - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 }
     - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 }
   - isl: 1024
     osl: 8192
     search-space:
     # If CONC >= 128, then DP_ATTN=true, MTP=1
-    - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 }
+    - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
     - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 }
   - isl: 8192
     osl: 1024
     search-space:
     # If CONC >= 64, then DP_ATTN=true, MTP=1
-    - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
+    - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 }
     - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 }
 
 dsr1-fp8-h200-sglang:
@@ -271,13 +271,13 @@ dsr1-fp8-h200-trt-mtp:
     osl: 8192
     search-space:
     # If CONC >= 256, then DP_ATTN=true, MTP=1
-    - { tp: 8, ep: 8, conc-start: 4, conc-end: 256 }
+    - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 }
     - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256 }
   - isl: 8192
     osl: 1024
     search-space:
     # If CONC >= 64, then DP_ATTN=true, MTP=1
-    - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
+    - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 }
     - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 }
 
 gptoss-fp4-b200-trt:

From dd2a82eaaf243b51b79b5b40b20e5c2ea34a23ea Mon Sep 17 00:00:00 2001
From: shicli <shicli@nvidia.com>
Date: Wed, 7 Jan 2026 11:45:13 +0800
Subject: [PATCH 07/19] Fix conc range overlaps in dsr1-fp4-b200-trt-mtp

- Remove overlapping boundaries between conc ranges
- Change ranges to avoid overlap: 4-8, 16-64, 128-256 (with gaps at 9-15, 65-127)
- All ranges now use powers of 2 boundaries without overlap
- Applies to all TP/ISL/OSL combinations in dsr1-fp4-b200-trt-mtp
---
 .github/configs/nvidia-master.yaml | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 87b6eaf50..4b72cc498 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -89,14 +89,14 @@ dsr1-fp4-b200-trt-mtp:
     # If TP=4:
     #   If CONC >= 16, then EP=4
     #   If CONC >= 128, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
-    - { tp: 4, conc-start: 4, conc-end: 16 }
-    - { tp: 4, ep: 4, conc-start: 16, conc-end: 128 }
+    - { tp: 4, conc-start: 4, conc-end: 8 }
+    - { tp: 4, ep: 4, conc-start: 16, conc-end: 64 }
     - { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 256 }
     # If TP=8:
     #   If CONC >= 16, then EP=8
     #   If CONC >= 64, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
-    - { tp: 8, conc-start: 4, conc-end: 16 }
-    - { tp: 8, ep: 8, conc-start: 16, conc-end: 64 }
+    - { tp: 8, conc-start: 4, conc-end: 8 }
+    - { tp: 8, ep: 8, conc-start: 16, conc-end: 32 }
     - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 }
   - isl: 1024
     osl: 8192
@@ -104,25 +104,25 @@ dsr1-fp4-b200-trt-mtp:
     # If TP=4:
     #   If CONC >= 32, then EP=4
     #   If CONC >= 128, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
-    - { tp: 4, conc-start: 4, conc-end: 32 }
-    - { tp: 4, ep: 4, conc-start: 32, conc-end: 128 }
+    - { tp: 4, conc-start: 4, conc-end: 16 }
+    - { tp: 4, ep: 4, conc-start: 32, conc-end: 64 }
     - { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 256 }
     # If TP=8:
     #   If CONC >= 8, then EP=8
     #   If CONC >= 128, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
-    - { tp: 8, conc-start: 4, conc-end: 8 }
-    - { tp: 8, ep: 8, conc-start: 8, conc-end: 128 }
+    - { tp: 8, conc-start: 4, conc-end: 4 }
+    - { tp: 8, ep: 8, conc-start: 8, conc-end: 64 }
     - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 }
   - isl: 8192
     osl: 1024
     search-space:
     # If TP=4:
     #   If CONC >= 32, then EP=4, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
-    - { tp: 4, conc-start: 4, conc-end: 32 }
+    - { tp: 4, conc-start: 4, conc-end: 16 }
     - { tp: 4, ep: 4, dp-attn: true, conc-start: 32, conc-end: 256 }
     # If TP=8:
     #   If CONC >= 32, then EP=8, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
-    - { tp: 8, conc-start: 4, conc-end: 32 }
+    - { tp: 8, conc-start: 4, conc-end: 16 }
     - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256 }
 
 dsr1-fp8-b200-sglang:

From b5c542b2ec2836438ed3017c872244a7c2275618 Mon Sep 17 00:00:00 2001
From: shicli <shicli@nvidia.com>
Date: Wed, 7 Jan 2026 15:21:44 +0800
Subject: [PATCH 08/19] larger h200 concurrency

---
 .github/configs/nvidia-master.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 4b72cc498..980020253 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -266,7 +266,7 @@ dsr1-fp8-h200-trt-mtp:
     osl: 1024
     search-space:
     # MTP=3 for all, no DP_ATTN in this sequence length
-    - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
+    - { tp: 8, ep: 8, conc-start: 4, conc-end: 256 }
   - isl: 1024
     osl: 8192
     search-space:

From cdebd62f62dfba6f65e177af53b7c81ea8ef1168 Mon Sep 17 00:00:00 2001
From: shicli <shicli@nvidia.com>
Date: Wed, 7 Jan 2026 15:34:24 +0800
Subject: [PATCH 09/19] fix runner

---
 .github/configs/nvidia-master.yaml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 980020253..86c3dccea 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -74,11 +74,11 @@ dsr1-fp4-b200-trt:
     - { tp: 8, conc-start: 4, conc-end: 32 }
     - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 }
 
-dsr1-fp4-b200-trt-mtp:
+dsr1-fp4-b200-trt:
   image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2
   model: nvidia/DeepSeek-R1-0528-FP4-V2
   model-prefix: dsr1
-  runner: b200-trt-mtp
+  runner: b200-trt
   precision: fp4
   framework: trt
   multinode: false
@@ -174,11 +174,11 @@ dsr1-fp8-b200-trt:
     # If CONC > 64, then DP_ATTN=true
     - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
 
-dsr1-fp8-b200-trt-mtp:
+dsr1-fp8-b200-trt:
   image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2
   model: deepseek-ai/DeepSeek-R1-0528
   model-prefix: dsr1
-  runner: b200-trt-mtp
+  runner: b200-trt
   precision: fp8
   framework: trt
   multinode: false

From 835c156af98b3bbb063eff0c24a289c9be90cc65 Mon Sep 17 00:00:00 2001
From: shicli <shicli@nvidia.com>
Date: Wed, 7 Jan 2026 15:39:16 +0800
Subject: [PATCH 10/19] fix typo

---
 .github/configs/nvidia-master.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 86c3dccea..8a84770a8 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -74,7 +74,7 @@ dsr1-fp4-b200-trt:
     - { tp: 8, conc-start: 4, conc-end: 32 }
     - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 }
 
-dsr1-fp4-b200-trt:
+dsr1-fp4-b200-trt-mtp:
   image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2
   model: nvidia/DeepSeek-R1-0528-FP4-V2
   model-prefix: dsr1
@@ -174,7 +174,7 @@ dsr1-fp8-b200-trt:
     # If CONC > 64, then DP_ATTN=true
     - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
 
-dsr1-fp8-b200-trt:
+dsr1-fp8-b200-trt-mtp:
   image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2
   model: deepseek-ai/DeepSeek-R1-0528
   model-prefix: dsr1

From f879ca434b3490ff6405534861b7d119ec9affb5 Mon Sep 17 00:00:00 2001
From: shicli <shicli@nvidia.com>
Date: Wed, 7 Jan 2026 15:40:34 +0800
Subject: [PATCH 11/19] fix h200 runner

---
 .github/configs/nvidia-master.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 8a84770a8..6fec1b558 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -256,7 +256,7 @@ dsr1-fp8-h200-trt-mtp:
   image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2
   model: deepseek-ai/DeepSeek-R1-0528
   model-prefix: dsr1
-  runner: h200-trt-mtp
+  runner: h200-trt
   precision: fp8
   framework: trt
   multinode: false

From a5f5ebf2b39c9689f28371aecbeae25d3c64f40e Mon Sep 17 00:00:00 2001
From: shicli <shicli@nvidia.com>
Date: Wed, 7 Jan 2026 15:41:40 +0800
Subject: [PATCH 12/19] fix h200 runner

---
 .github/configs/nvidia-master.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 6fec1b558..0d5a357cb 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -256,7 +256,7 @@ dsr1-fp8-h200-trt-mtp:
   image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2
   model: deepseek-ai/DeepSeek-R1-0528
   model-prefix: dsr1
-  runner: h200-trt
+  runner: h200
   precision: fp8
   framework: trt
   multinode: false

From 92fe8725987b3a19a32c9cb0d586e2c71976c7d3 Mon Sep 17 00:00:00 2001
From: ankursingh-nv <ankusingh@nvidia.com>
Date: Wed, 7 Jan 2026 10:27:58 -0800
Subject: [PATCH 13/19] Add MTP support for single-node TRT configs and launch
 scripts

---
 .github/configs/nvidia-master.yaml | 54 +++++++++++++++---------------
 runners/launch_b200-nb.sh          |  3 +-
 runners/launch_b200-nv.sh          |  3 +-
 runners/launch_h200-cw.sh          |  3 +-
 runners/launch_h200-nb.sh          |  3 +-
 runners/launch_h200-nv.sh          |  3 +-
 6 files changed, 37 insertions(+), 32 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 0d5a357cb..1ae9c90f5 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -89,41 +89,41 @@ dsr1-fp4-b200-trt-mtp:
     # If TP=4:
     #   If CONC >= 16, then EP=4
     #   If CONC >= 128, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
-    - { tp: 4, conc-start: 4, conc-end: 8 }
-    - { tp: 4, ep: 4, conc-start: 16, conc-end: 64 }
-    - { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 256 }
+    - { tp: 4, conc-start: 4, conc-end: 8, spec-decoding: mtp }
+    - { tp: 4, ep: 4, conc-start: 16, conc-end: 64, spec-decoding: mtp }
+    - { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
     # If TP=8:
     #   If CONC >= 16, then EP=8
     #   If CONC >= 64, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
-    - { tp: 8, conc-start: 4, conc-end: 8 }
-    - { tp: 8, ep: 8, conc-start: 16, conc-end: 32 }
-    - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 }
+    - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp }
+    - { tp: 8, ep: 8, conc-start: 16, conc-end: 32, spec-decoding: mtp }
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp }
   - isl: 1024
     osl: 8192
     search-space:
     # If TP=4:
     #   If CONC >= 32, then EP=4
     #   If CONC >= 128, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
-    - { tp: 4, conc-start: 4, conc-end: 16 }
-    - { tp: 4, ep: 4, conc-start: 32, conc-end: 64 }
-    - { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 256 }
+    - { tp: 4, conc-start: 4, conc-end: 16, spec-decoding: mtp }
+    - { tp: 4, ep: 4, conc-start: 32, conc-end: 64, spec-decoding: mtp }
+    - { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
     # If TP=8:
     #   If CONC >= 8, then EP=8
     #   If CONC >= 128, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
-    - { tp: 8, conc-start: 4, conc-end: 4 }
-    - { tp: 8, ep: 8, conc-start: 8, conc-end: 64 }
-    - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 }
+    - { tp: 8, conc-start: 4, conc-end: 4, spec-decoding: mtp }
+    - { tp: 8, ep: 8, conc-start: 8, conc-end: 64, spec-decoding: mtp }
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
   - isl: 8192
     osl: 1024
     search-space:
     # If TP=4:
     #   If CONC >= 32, then EP=4, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
-    - { tp: 4, conc-start: 4, conc-end: 16 }
-    - { tp: 4, ep: 4, dp-attn: true, conc-start: 32, conc-end: 256 }
+    - { tp: 4, conc-start: 4, conc-end: 16, spec-decoding: mtp }
+    - { tp: 4, ep: 4, dp-attn: true, conc-start: 32, conc-end: 256, spec-decoding: mtp }
     # If TP=8:
     #   If CONC >= 32, then EP=8, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
-    - { tp: 8, conc-start: 4, conc-end: 16 }
-    - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256 }
+    - { tp: 8, conc-start: 4, conc-end: 16, spec-decoding: mtp }
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256, spec-decoding: mtp }
 
 dsr1-fp8-b200-sglang:
   image: lmsysorg/sglang:v0.5.6-cu129-amd64
@@ -188,20 +188,20 @@ dsr1-fp8-b200-trt-mtp:
     osl: 1024
     search-space:
     # If CONC >= 64, then DP_ATTN=true, MTP=1
-    - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 }
-    - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 }
+    - { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp }
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp }
   - isl: 1024
     osl: 8192
     search-space:
     # If CONC >= 128, then DP_ATTN=true, MTP=1
-    - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
-    - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 }
+    - { tp: 8, ep: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
   - isl: 8192
     osl: 1024
     search-space:
     # If CONC >= 64, then DP_ATTN=true, MTP=1
-    - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 }
-    - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 }
+    - { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp }
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp }
 
 dsr1-fp8-h200-sglang:
   image: lmsysorg/sglang:v0.5.6-cu129-amd64
@@ -266,19 +266,19 @@ dsr1-fp8-h200-trt-mtp:
     osl: 1024
     search-space:
     # MTP=3 for all, no DP_ATTN in this sequence length
-    - { tp: 8, ep: 8, conc-start: 4, conc-end: 256 }
+    - { tp: 8, ep: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp }
   - isl: 1024
     osl: 8192
     search-space:
     # If CONC >= 256, then DP_ATTN=true, MTP=1
-    - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 }
-    - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256 }
+    - { tp: 8, ep: 8, conc-start: 4, conc-end: 128, spec-decoding: mtp }
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256, spec-decoding: mtp }
   - isl: 8192
     osl: 1024
     search-space:
     # If CONC >= 64, then DP_ATTN=true, MTP=1
-    - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 }
-    - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 }
+    - { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp }
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp }
 
 gptoss-fp4-b200-trt:
   image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc2
diff --git a/runners/launch_b200-nb.sh b/runners/launch_b200-nb.sh
index 1cb5c3dd1..08ed1e455 100644
--- a/runners/launch_b200-nb.sh
+++ b/runners/launch_b200-nb.sh
@@ -3,6 +3,7 @@
 HF_HUB_CACHE_MOUNT="/mnt/data/hf-hub-cache-${USER: -1}/"
 PARTITION="main"
 FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
+SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
 
 UCX_NET_DEVICES=eth0
 
@@ -19,4 +20,4 @@ srun --partition=$PARTITION --gres=gpu:$TP --exclusive \
 --container-writable \
 --container-workdir=/workspace/ \
 --no-container-entrypoint --export=ALL,PORT_OFFSET=${USER: -1},UCX_NET_DEVICES=$UCX_NET_DEVICES \
-bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_slurm.sh
\ No newline at end of file
+bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}_slurm.sh
\ No newline at end of file
diff --git a/runners/launch_b200-nv.sh b/runners/launch_b200-nv.sh
index 28286e2be..1305c6848 100644
--- a/runners/launch_b200-nv.sh
+++ b/runners/launch_b200-nv.sh
@@ -5,6 +5,7 @@ export PORT_OFFSET=0  # Doesn't matter when --exclusive
 
 MODEL_CODE="${EXP_NAME%%_*}"
 FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
+SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
 
 PARTITION="dgx-b200"
 SQUASH_FILE="/raid/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
@@ -22,6 +23,6 @@ srun --jobid=$JOB_ID \
 --container-writable \
 --container-workdir=/workspace/ \
 --no-container-entrypoint --export=ALL \
-bash benchmarks/${MODEL_CODE}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_slurm.sh
+bash benchmarks/${MODEL_CODE}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}_slurm.sh
 
 scancel $JOB_ID
diff --git a/runners/launch_h200-cw.sh b/runners/launch_h200-cw.sh
index 0b6740d7b..f72b8bb0d 100644
--- a/runners/launch_h200-cw.sh
+++ b/runners/launch_h200-cw.sh
@@ -5,6 +5,7 @@ export PORT_OFFSET=${USER: -1}
 
 MODEL_CODE="${EXP_NAME%%_*}"
 FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
+SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
 
 PARTITION="h200"
 SQUASH_FILE="/mnt/vast/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
@@ -31,7 +32,7 @@ srun --jobid=$JOB_ID \
 --container-mount-home \
 --container-workdir=/workspace/ \
 --no-container-entrypoint --export=ALL \
-bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}_slurm.sh
+bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}_slurm.sh
 
 rmdir $SAGEMAKER_SHM_PATH
 scancel $JOB_ID
diff --git a/runners/launch_h200-nb.sh b/runners/launch_h200-nb.sh
index 15b6fa6c5..703bcf231 100644
--- a/runners/launch_h200-nb.sh
+++ b/runners/launch_h200-nb.sh
@@ -5,6 +5,7 @@ export PORT_OFFSET=${USER: -1}
 
 MODEL_CODE="${EXP_NAME%%_*}"
 FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
+SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
 
 PARTITION="main"
 SQUASH_FILE="/home/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
@@ -29,6 +30,6 @@ srun --jobid=$JOB_ID \
 --container-mount-home \
 --container-workdir=/workspace/ \
 --no-container-entrypoint --export=ALL \
-bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}_slurm.sh
+bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}_slurm.sh
 
 scancel $JOB_ID
diff --git a/runners/launch_h200-nv.sh b/runners/launch_h200-nv.sh
index 3282be1a8..0434f880b 100644
--- a/runners/launch_h200-nv.sh
+++ b/runners/launch_h200-nv.sh
@@ -5,6 +5,7 @@ export PORT_OFFSET=0  # Doesn't matter when --exclusive
 
 MODEL_CODE="${EXP_NAME%%_*}"
 FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
+SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
 
 PARTITION="dgx-h200"
 SQUASH_FILE="/raid/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
@@ -22,6 +23,6 @@ srun --jobid=$JOB_ID \
 --container-mount-home \
 --container-workdir=/workspace/ \
 --no-container-entrypoint --export=ALL \
-bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}_slurm.sh
+bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}_slurm.sh
 
 scancel $JOB_ID

From 48f17a774cb5ceb5df7d043ca18da43a27725880 Mon Sep 17 00:00:00 2001
From: ankursingh-nv <ankusingh@nvidia.com>
Date: Wed, 7 Jan 2026 10:35:49 -0800
Subject: [PATCH 14/19] Add MTP configs to perf-changelog

---
 perf-changelog.yaml | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index c7f68885c..9d361003f 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -141,3 +141,20 @@
   description:
     - Use upstream SGLang images on mi300, mi325 and mi355 for dsr1fp8
   pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/332
+
+- config-keys:
+    - gptoss-fp4-gb200-dynamo-trt
+    - gptoss-fp4-b200-trt
+  description:
+    - Explicitly add EP=TP for DP attention configs for B200 AGG nvidia-master file. Multinode Refactor inadvertently changed default EP=1
+    - Add GPTOSS DISAGG configurations for GB200 1k1k and 8k1k.
+  pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/387
+
+- config-keys:
+    - dsr1-fp4-b200-trt-mtp
+    - dsr1-fp8-b200-trt-mtp
+    - dsr1-fp8-h200-trt-mtp
+  description:
+    - Add MTP (Multi-Token Prediction) support for single-node TRT configs
+    - Add spec-decoding field to config entries and update launch scripts to select MTP benchmark scripts
+  pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/392

From 71f5d4af70c89ebec7ba210006621743897eb13e Mon Sep 17 00:00:00 2001
From: ankursingh-nv <ankusingh@nvidia.com>
Date: Wed, 7 Jan 2026 14:27:10 -0800
Subject: [PATCH 15/19] fix perf-changelog

---
 perf-changelog.yaml | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index ceaece321..38e0b242e 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -150,13 +150,6 @@
     - Add GPTOSS DISAGG configurations for GB200 1k1k and 8k1k.
   pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/387
 
-- config-keys:
-    - dsr1-fp8-mi355x-sglang-disagg
-  description:
-    - "Add PD disaggregation (1P2D) for Mi355X"
-    - "Includes with and without speculative decoding"
-  pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/348
-
 - config-keys:
     - dsr1-fp4-b200-trt-mtp
     - dsr1-fp8-b200-trt-mtp

From ba1a206e646a578708d51d8c641b0c098ec9501a Mon Sep 17 00:00:00 2001
From: shicli <shicli@nvidia.com>
Date: Thu, 8 Jan 2026 10:16:46 +0800
Subject: [PATCH 16/19] fix H200 config

---
 .github/configs/nvidia-master.yaml        | 5 +++--
 benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh | 5 +++++
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 57be1d150..58f297834 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -265,8 +265,9 @@ dsr1-fp8-h200-trt-mtp:
   - isl: 1024
     osl: 1024
     search-space:
-    # MTP=3 for all, no DP_ATTN in this sequence length
-    - { tp: 8, ep: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp }
+    # If CONC >= 128, then DP_ATTN=true, MTP=1
+    - { tp: 8, ep: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
   - isl: 1024
     osl: 8192
     search-space:
diff --git a/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh
index b7063f395..4e869764a 100644
--- a/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh
+++ b/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh
@@ -34,6 +34,11 @@ SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
 PORT=$(( 8888 + $PORT_OFFSET ))
 EXTRA_CONFIG_FILE="dsr1-fp8-mtp.yml"
 
+# If ISL=8192 and DP_ATTENTION=true, export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:8192
+if [[ "$ISL" == "8192" && "$DP_ATTENTION" == "true" ]]; then
+    export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:8192"
+fi
+
 cat > $EXTRA_CONFIG_FILE << EOF
 cuda_graph_config:
     enable_padding: true

From b1f04df94bcab81c1408e1be17e3509c364147c7 Mon Sep 17 00:00:00 2001
From: ankursingh-nv <ankusingh@nvidia.com>
Date: Thu, 8 Jan 2026 15:06:36 -0800
Subject: [PATCH 17/19] fix perf-changelog

fix perf-changelog file
---
 perf-changelog.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index bf0b26d15..d23dd87d9 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -150,7 +150,7 @@
     - Add GPTOSS DISAGG configurations for GB200 1k1k and 8k1k.
   pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/387
 
-- config-keys: 
+- config-keys:
     - dsr1-fp8-mi355x-sglang-disagg
   description:
     - "Add PD disaggregation (1P2D) for Mi355X"
@@ -164,4 +164,4 @@
   description:
     - Add MTP (Multi-Token Prediction) support for single-node TRT configs
     - Add spec-decoding field to config entries and update launch scripts to select MTP benchmark scripts
-  pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/392
\ No newline at end of file
+  pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/392

From dae544a58cfa8c604fff63da831f3bb5a5d68761 Mon Sep 17 00:00:00 2001
From: ankursingh-nv <ankusingh@nvidia.com>
Date: Mon, 12 Jan 2026 10:29:12 -0800
Subject: [PATCH 18/19] no chat template

---
 benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh | 3 +--
 benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh | 3 +--
 benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh | 3 +--
 perf-changelog.yaml                       | 9 +++------
 4 files changed, 6 insertions(+), 12 deletions(-)

diff --git a/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh
index 33d819efa..52ca1e9e1 100644
--- a/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh
+++ b/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh
@@ -100,5 +100,4 @@ run_benchmark_serving \
     --num-prompts $(( $CONC * 10 )) \
     --max-concurrency "$CONC" \
     --result-filename "$RESULT_FILENAME" \
-    --result-dir /workspace/ \
-    --use-chat-template
+    --result-dir /workspace/
diff --git a/benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh
index 9054bd014..519e49089 100644
--- a/benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh
+++ b/benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh
@@ -100,5 +100,4 @@ run_benchmark_serving \
     --num-prompts $(( $CONC * 10 )) \
     --max-concurrency "$CONC" \
     --result-filename "$RESULT_FILENAME" \
-    --result-dir /workspace/ \
-    --use-chat-template
+    --result-dir /workspace/
diff --git a/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh
index 4e869764a..ee252098b 100644
--- a/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh
+++ b/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh
@@ -105,5 +105,4 @@ run_benchmark_serving \
     --num-prompts $(( $CONC * 10 )) \
     --max-concurrency "$CONC" \
     --result-filename "$RESULT_FILENAME" \
-    --result-dir /workspace/ \
-    --use-chat-template
+    --result-dir /workspace/
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index d23dd87d9..21f2e8655 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -158,10 +158,7 @@
   pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/348
 
 - config-keys:
-    - dsr1-fp4-b200-trt-mtp
-    - dsr1-fp8-b200-trt-mtp
-    - dsr1-fp8-h200-trt-mtp
+    - dsr1-fp4-mi355x-sglang
   description:
-    - Add MTP (Multi-Token Prediction) support for single-node TRT configs
-    - Add spec-decoding field to config entries and update launch scripts to select MTP benchmark scripts
-  pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/392
+    - "Updating MI355x Deepseek-R1 FP4 SGLang Image to upstream v0.5.7"
+  pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/395

From 055084fc4dee0bcdf7d08d6c5427d46106907ad3 Mon Sep 17 00:00:00 2001
From: ankursingh-nv <ankusingh@nvidia.com>
Date: Mon, 12 Jan 2026 10:34:24 -0800
Subject: [PATCH 19/19] update perf-changelog

---
 perf-changelog.yaml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 21f2e8655..af44b69ac 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -162,3 +162,12 @@
   description:
     - "Updating MI355x Deepseek-R1 FP4 SGLang Image to upstream v0.5.7"
   pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/395
+
+- config-keys:
+    - dsr1-fp4-b200-trt-mtp
+    - dsr1-fp8-b200-trt-mtp
+    - dsr1-fp8-h200-trt-mtp
+  description:
+    - Add MTP (Multi-Token Prediction) support for single-node TRT configs
+    - Add spec-decoding field to config entries and update launch scripts to select MTP benchmark scripts
+  pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/412
\ No newline at end of file