diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 6b029001d..58f297834 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -74,6 +74,57 @@ dsr1-fp4-b200-trt:
       - { tp: 8, conc-start: 4, conc-end: 32 }
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 }
 
+dsr1-fp4-b200-trt-mtp:
+  image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2
+  model: nvidia/DeepSeek-R1-0528-FP4-V2
+  model-prefix: dsr1
+  runner: b200-trt
+  precision: fp4
+  framework: trt
+  multinode: false
+  seq-len-configs:
+    - isl: 1024
+      osl: 1024
+      search-space:
+        # If TP=4:
+        #   If CONC >= 16, then EP=4
+        #   If CONC >= 128, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
+        - { tp: 4, conc-start: 4, conc-end: 8, spec-decoding: mtp }
+        - { tp: 4, ep: 4, conc-start: 16, conc-end: 64, spec-decoding: mtp }
+        - { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
+        # If TP=8:
+        #   If CONC >= 16, then EP=8
+        #   If CONC >= 64, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
+        - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp }
+        - { tp: 8, ep: 8, conc-start: 16, conc-end: 32, spec-decoding: mtp }
+        - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp }
+    - isl: 1024
+      osl: 8192
+      search-space:
+        # If TP=4:
+        #   If CONC >= 32, then EP=4
+        #   If CONC >= 128, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
+        - { tp: 4, conc-start: 4, conc-end: 16, spec-decoding: mtp }
+        - { tp: 4, ep: 4, conc-start: 32, conc-end: 64, spec-decoding: mtp }
+        - { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
+        # If TP=8:
+        #   If CONC >= 8, then EP=8
+        #   If CONC >= 128, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
+        - { tp: 8, conc-start: 4, conc-end: 4, spec-decoding: mtp }
+        - { tp: 8, ep: 8, conc-start: 8, conc-end: 64, spec-decoding: mtp }
+        - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
+    - isl: 8192
+      osl: 1024
+      search-space:
+        # If TP=4:
+        #   If CONC >= 32, then EP=4, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
+        - { tp: 4, conc-start: 4, conc-end: 16, spec-decoding: mtp }
+        - { tp: 4, ep: 4, dp-attn: true, conc-start: 32, conc-end: 256, spec-decoding: mtp }
+        # If TP=8:
+        #   If CONC >= 32, then EP=8, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
+        - { tp: 8, conc-start: 4, conc-end: 16, spec-decoding: mtp }
+        - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256, spec-decoding: mtp }
+
 dsr1-fp8-b200-sglang:
   image: lmsysorg/sglang:v0.5.6-cu129-amd64
   model: deepseek-ai/DeepSeek-R1-0528
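Reading note for the search-space blocks above and below: each row pins one parallelism layout (tp/ep/dp-attn plus spec-decoding) to a concurrency window, and the comments restate the thresholds at which the layout switches. A minimal sketch of how one row presumably expands into individual runs, assuming the harness steps concurrency in powers of two from conc-start to conc-end (the stepping policy is not part of this diff):

```bash
# Hypothetical expansion of one row from dsr1-fp4-b200-trt-mtp (isl=1024, osl=1024):
#   - { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
tp=4 ep=4 dp_attn=true conc_start=128 conc_end=256 spec=mtp
for (( conc = conc_start; conc <= conc_end; conc *= 2 )); do
    echo "run: TP=$tp EP=$ep DP_ATTENTION=$dp_attn CONC=$conc SPEC_DECODING=$spec"
done
# run: TP=4 EP=4 DP_ATTENTION=true CONC=128 SPEC_DECODING=mtp
# run: TP=4 EP=4 DP_ATTENTION=true CONC=256 SPEC_DECODING=mtp
```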
@@ -123,6 +174,35 @@ dsr1-fp8-b200-trt:
       # If CONC > 64, then DP_ATTN=true
       - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
 
+dsr1-fp8-b200-trt-mtp:
+  image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  runner: b200-trt
+  precision: fp8
+  framework: trt
+  multinode: false
+  seq-len-configs:
+    # For all sequence lengths, EP=TP, MOE_BACKEND=DEEPGEMM, MTP=3 (or MTP=1 when DP_ATTN=true)
+    - isl: 1024
+      osl: 1024
+      search-space:
+        # If CONC >= 64, then DP_ATTN=true, MTP=1
+        - { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp }
+        - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp }
+    - isl: 1024
+      osl: 8192
+      search-space:
+        # If CONC >= 128, then DP_ATTN=true, MTP=1
+        - { tp: 8, ep: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+        - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
+    - isl: 8192
+      osl: 1024
+      search-space:
+        # If CONC >= 64, then DP_ATTN=true, MTP=1
+        - { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp }
+        - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp }
+
 dsr1-fp8-h200-sglang:
   image: lmsysorg/sglang:v0.5.6-cu129-amd64
   model: deepseek-ai/DeepSeek-R1-0528
@@ -172,6 +252,35 @@ dsr1-fp8-h200-trt:
       - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 }
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64 }
 
+dsr1-fp8-h200-trt-mtp:
+  image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  runner: h200
+  precision: fp8
+  framework: trt
+  multinode: false
+  # For all sequence lengths, EP=TP, MOE_BACKEND=CUTLASS, MTP=3 (or MTP=1 when DP_ATTN=true)
+  seq-len-configs:
+    - isl: 1024
+      osl: 1024
+      search-space:
+        # If CONC >= 128, then DP_ATTN=true, MTP=1
+        - { tp: 8, ep: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+        - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
+    - isl: 1024
+      osl: 8192
+      search-space:
+        # If CONC >= 256, then DP_ATTN=true, MTP=1
+        - { tp: 8, ep: 8, conc-start: 4, conc-end: 128, spec-decoding: mtp }
+        - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256, spec-decoding: mtp }
+    - isl: 8192
+      osl: 1024
+      search-space:
+        # If CONC >= 64, then DP_ATTN=true, MTP=1
+        - { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp }
+        - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp }
+
 gptoss-fp4-b200-trt:
   image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc2
   model: openai/gpt-oss-120b
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index cc7b81553..fad1c5064 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -92,7 +92,7 @@ wait_for_server_ready() {
 }
 
 # Run benchmark serving with standardized parameters
-# All parameters are required
+# All parameters are required except --use-chat-template
 # Parameters:
 # --model: Model name
 # --port: Server port
@@ -104,6 +104,7 @@ wait_for_server_ready() {
 # --max-concurrency: Max concurrency
 # --result-filename: Result filename without extension
 # --result-dir: Result directory
+# --use-chat-template: Optional flag to enable chat template
 run_benchmark_serving() {
     set +x
     local model=""
@@ -116,6 +117,7 @@ run_benchmark_serving() {
     local max_concurrency=""
     local result_filename=""
     local result_dir=""
+    local use_chat_template=false
 
     # Parse arguments
     while [[ $# -gt 0 ]]; do
@@ -160,6 +162,10 @@ run_benchmark_serving() {
                 result_dir="$2"
                 shift 2
                 ;;
+            --use-chat-template)
+                use_chat_template=true
+                shift
+                ;;
             *)
                 echo "Unknown parameter: $1"
                 return 1
@@ -224,23 +230,33 @@ run_benchmark_serving() {
     local BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX)
     git clone https://github.com/kimbochen/bench_serving.git "$BENCH_SERVING_DIR"
 
+    # Build benchmark command
+    local benchmark_cmd=(
+        python3 "$BENCH_SERVING_DIR/benchmark_serving.py"
+        --model "$model"
+        --backend "$backend"
+        --base-url "http://0.0.0.0:$port"
+        --dataset-name random
+        --random-input-len "$input_len"
+        --random-output-len "$output_len"
+        --random-range-ratio "$random_range_ratio"
+        --num-prompts "$num_prompts"
+        --max-concurrency "$max_concurrency"
+        --request-rate inf
+        --ignore-eos
+        --save-result
+        --percentile-metrics 'ttft,tpot,itl,e2el'
+        --result-dir "$result_dir"
+        --result-filename "$result_filename.json"
+    )
+
+    # Add --use-chat-template if requested
+    if [[ "$use_chat_template" == true ]]; then
+        benchmark_cmd+=(--use-chat-template)
+    fi
+
     # Run benchmark
     set -x
-    python3 "$BENCH_SERVING_DIR/benchmark_serving.py" \
-        --model "$model" \
-        --backend "$backend" \
-        --base-url "http://0.0.0.0:$port" \
-        --dataset-name random \
-        --random-input-len "$input_len" \
-        --random-output-len "$output_len" \
-        --random-range-ratio "$random_range_ratio" \
-        --num-prompts "$num_prompts" \
-        --max-concurrency "$max_concurrency" \
-        --request-rate inf \
-        --ignore-eos \
-        --save-result \
-        --percentile-metrics 'ttft,tpot,itl,e2el' \
-        --result-dir "$result_dir" \
-        --result-filename "$result_filename.json"
+    "${benchmark_cmd[@]}"
     set +x
 }
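With run_benchmark_serving now assembling its command as a bash array, optional flags can be appended without duplicating the long backslash-continued invocation, and the array form avoids the quoting pitfalls of building the command as a string and running it through eval. A hypothetical call site with illustrative values (only --use-chat-template is new in this PR; the MTP scripts below do not pass it):

```bash
source benchmarks/benchmark_lib.sh

run_benchmark_serving \
    --model deepseek-ai/DeepSeek-R1-0528 \
    --port 8888 \
    --backend openai \
    --input-len 1024 \
    --output-len 1024 \
    --random-range-ratio 1.0 \
    --num-prompts 320 \
    --max-concurrency 32 \
    --result-filename dsr1_example \
    --result-dir /workspace/ \
    --use-chat-template
```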
diff --git a/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh
new file mode 100644
index 000000000..52ca1e9e1
--- /dev/null
+++ b/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh
@@ -0,0 +1,103 @@
+#!/usr/bin/env bash
+
+# === Required Env Vars ===
+# MODEL
+# TP
+# CONC
+# ISL
+# OSL
+# MAX_MODEL_LEN
+# RANDOM_RANGE_RATIO
+# RESULT_FILENAME
+# PORT_OFFSET
+# DP_ATTENTION
+# EP_SIZE
+
+echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+
+echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"
+
+hf download $MODEL
+
+# ========= Determine MOE_BACKEND and MTP based on DP_ATTENTION =========
+if [[ "$DP_ATTENTION" == "true" ]]; then
+    MOE_BACKEND="CUTLASS"
+    MTP=1
+else
+    MOE_BACKEND="TRTLLM"
+    MTP=3
+fi
+
+echo "MOE_BACKEND='$MOE_BACKEND', MTP='$MTP'"
+
+SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
+PORT=$(( 8888 + $PORT_OFFSET ))
+EXTRA_CONFIG_FILE="dsr1-fp4-mtp.yml"
+
+cat > $EXTRA_CONFIG_FILE << EOF
+cuda_graph_config:
+  enable_padding: true
+  max_batch_size: 512
+enable_attention_dp: $DP_ATTENTION
+print_iter_log: true
+kv_cache_config:
+  dtype: fp8
+  free_gpu_memory_fraction: 0.8
+  enable_block_reuse: false
+stream_interval: 10
+moe_config:
+  backend: $MOE_BACKEND
+speculative_config:
+  decoding_type: MTP
+  num_nextn_predict_layers: ${MTP}
+EOF
+
+if [[ "$DP_ATTENTION" == "true" ]]; then
+    cat << EOF >> $EXTRA_CONFIG_FILE
+attention_dp_config:
+  batching_wait_iters: 0
+  enable_balance: true
+  timeout_iters: 60
+EOF
+fi
+
+if [[ "$DP_ATTENTION" == "true" ]]; then
+    MAX_BATCH_SIZE=$((CONC/TP))
+else
+    MAX_BATCH_SIZE=$CONC
+fi
+
+MAX_NUM_TOKENS=$(( ((MTP+1)*MAX_BATCH_SIZE+ISL+64+63)/64*64 ))
+
+set -x
+# Launch TRT-LLM server
+mpirun -n 1 --oversubscribe --allow-run-as-root \
+    trtllm-serve $MODEL --port=$PORT \
+    --trust_remote_code \
+    --backend=pytorch \
+    --max_batch_size=$MAX_BATCH_SIZE \
+    --max_seq_len=$MAX_MODEL_LEN \
+    --max_num_tokens=$MAX_NUM_TOKENS \
+    --tp_size=$TP --ep_size=$EP_SIZE \
+    --extra_llm_api_options=$EXTRA_CONFIG_FILE \
+    > $SERVER_LOG 2>&1 &
+
+SERVER_PID=$!
+
+# Source benchmark utilities
+source "$(dirname "$0")/benchmark_lib.sh"
+
+# Wait for server to be ready
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend openai \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts $(( $CONC * 10 )) \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir /workspace/
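The MAX_NUM_TOKENS arithmetic in the script above is dense, so here is one hedged reading: it appears to budget (MTP+1) tokens per sequence for a speculative decode step across the whole batch, plus one full-length prefill (ISL) and 64 tokens of headroom, then round up to a multiple of 64 (the +63 before the integer division performs the round-up). A worked example with values this config actually produces (TP=8, DP_ATTENTION=true, CONC=256, so MTP=1 and MAX_BATCH_SIZE=256/8=32):

```bash
MTP=1 MAX_BATCH_SIZE=32 ISL=1024
echo $(( ((MTP+1)*MAX_BATCH_SIZE+ISL+64+63)/64*64 ))
# (2*32 + 1024 + 64 + 63) = 1215; 1215/64 = 18 (integer division); 18*64 = 1152
```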
diff --git a/benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh
new file mode 100644
index 000000000..519e49089
--- /dev/null
+++ b/benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh
@@ -0,0 +1,103 @@
+#!/usr/bin/env bash
+
+# === Required Env Vars ===
+# MODEL
+# TP
+# CONC
+# ISL
+# OSL
+# MAX_MODEL_LEN
+# RANDOM_RANGE_RATIO
+# RESULT_FILENAME
+# PORT_OFFSET
+# DP_ATTENTION
+# EP_SIZE
+
+echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+
+echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"
+
+hf download $MODEL
+
+# ========= MOE_BACKEND is fixed to DEEPGEMM; MTP depends on DP_ATTENTION =========
+MOE_BACKEND="DEEPGEMM"
+
+if [[ "$DP_ATTENTION" == "true" ]]; then
+    MTP=1
+else
+    MTP=3
+fi
+
+echo "MOE_BACKEND='$MOE_BACKEND', MTP='$MTP'"
+
+SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
+PORT=$(( 8888 + $PORT_OFFSET ))
+EXTRA_CONFIG_FILE="dsr1-fp8-mtp.yml"
+
+cat > $EXTRA_CONFIG_FILE << EOF
+cuda_graph_config:
+  enable_padding: true
+  max_batch_size: 256
+enable_attention_dp: $DP_ATTENTION
+print_iter_log: true
+kv_cache_config:
+  dtype: fp8
+  free_gpu_memory_fraction: 0.8
+  enable_block_reuse: false
+stream_interval: 10
+moe_config:
+  backend: $MOE_BACKEND
+speculative_config:
+  decoding_type: MTP
+  num_nextn_predict_layers: ${MTP}
+EOF
+
+if [[ "$DP_ATTENTION" == "true" ]]; then
+    cat << EOF >> $EXTRA_CONFIG_FILE
+attention_dp_config:
+  batching_wait_iters: 0
+  enable_balance: true
+  timeout_iters: 60
+EOF
+fi
+
+if [[ "$DP_ATTENTION" == "true" ]]; then
+    MAX_BATCH_SIZE=$((CONC/TP))
+else
+    MAX_BATCH_SIZE=$CONC
+fi
+
+MAX_NUM_TOKENS=$(( ((MTP+1)*MAX_BATCH_SIZE+ISL+64+63)/64*64 ))
+
+set -x
+# Launch TRT-LLM server
+mpirun -n 1 --oversubscribe --allow-run-as-root \
+    trtllm-serve $MODEL --port=$PORT \
+    --trust_remote_code \
+    --backend=pytorch \
+    --max_batch_size=$MAX_BATCH_SIZE \
+    --max_seq_len=$MAX_MODEL_LEN \
+    --max_num_tokens=$MAX_NUM_TOKENS \
+    --tp_size=$TP --ep_size=$EP_SIZE \
+    --extra_llm_api_options=$EXTRA_CONFIG_FILE \
+    > $SERVER_LOG 2>&1 &
+
+SERVER_PID=$!
+
+# Source benchmark utilities
+source "$(dirname "$0")/benchmark_lib.sh"
+
+# Wait for server to be ready
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend openai \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts $(( $CONC * 10 )) \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir /workspace/
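Note the MAX_BATCH_SIZE branch shared by all three scripts: with attention DP each rank batches its own share of the traffic, so the global concurrency is divided by TP; without it the server sees the full concurrency as one batch. Hand-computing the shell arithmetic for the DP-attention concurrencies these configs enable (all have CONC >= 64 at TP=8, so the integer division never floors to zero):

```bash
# DP_ATTENTION=true  -> per-rank batch: MAX_BATCH_SIZE = CONC / TP
# DP_ATTENTION=false -> global batch:   MAX_BATCH_SIZE = CONC
TP=8
for CONC in 64 128 256; do
    echo "CONC=$CONC TP=$TP -> per-rank MAX_BATCH_SIZE=$(( CONC / TP ))"
done
# CONC=64  TP=8 -> per-rank MAX_BATCH_SIZE=8
# CONC=128 TP=8 -> per-rank MAX_BATCH_SIZE=16
# CONC=256 TP=8 -> per-rank MAX_BATCH_SIZE=32
```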
diff --git a/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh
new file mode 100644
index 000000000..ee252098b
--- /dev/null
+++ b/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh
@@ -0,0 +1,108 @@
+#!/usr/bin/env bash
+
+# === Required Env Vars ===
+# MODEL
+# TP
+# CONC
+# ISL
+# OSL
+# MAX_MODEL_LEN
+# RANDOM_RANGE_RATIO
+# RESULT_FILENAME
+# PORT_OFFSET
+# DP_ATTENTION
+# EP_SIZE
+
+echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+
+echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"
+
+hf download $MODEL
+
+# ========= MOE_BACKEND is fixed to CUTLASS; MTP depends on DP_ATTENTION =========
+MOE_BACKEND="CUTLASS"
+
+if [[ "$DP_ATTENTION" == "true" ]]; then
+    MTP=1
+else
+    MTP=3
+fi
+
+echo "MOE_BACKEND='$MOE_BACKEND', MTP='$MTP'"
+
+SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
+PORT=$(( 8888 + $PORT_OFFSET ))
+EXTRA_CONFIG_FILE="dsr1-fp8-mtp.yml"
+
+# If ISL=8192 and DP_ATTENTION=true, export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:8192
+if [[ "$ISL" == "8192" && "$DP_ATTENTION" == "true" ]]; then
+    export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:8192"
+fi
+
+cat > $EXTRA_CONFIG_FILE << EOF
+cuda_graph_config:
+  enable_padding: true
+  max_batch_size: 128
+enable_attention_dp: $DP_ATTENTION
+print_iter_log: true
+kv_cache_config:
+  dtype: fp8
+  free_gpu_memory_fraction: 0.75
+  enable_block_reuse: false
+stream_interval: 10
+moe_config:
+  backend: $MOE_BACKEND
+speculative_config:
+  decoding_type: MTP
+  num_nextn_predict_layers: ${MTP}
+EOF
+
+if [[ "$DP_ATTENTION" == "true" ]]; then
+    cat << EOF >> $EXTRA_CONFIG_FILE
+attention_dp_config:
+  batching_wait_iters: 0
+  enable_balance: true
+  timeout_iters: 60
+EOF
+fi
+
+if [[ "$DP_ATTENTION" == "true" ]]; then
+    MAX_BATCH_SIZE=$((CONC/TP))
+else
+    MAX_BATCH_SIZE=$CONC
+fi
+
+MAX_NUM_TOKENS=$(( ((MTP+1)*MAX_BATCH_SIZE+ISL+64+63)/64*64 ))
+
+set -x
+# Launch TRT-LLM server
+PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \
+    trtllm-serve $MODEL --port=$PORT \
+    --trust_remote_code \
+    --backend=pytorch \
+    --max_batch_size=$MAX_BATCH_SIZE \
+    --max_seq_len=$MAX_MODEL_LEN \
+    --max_num_tokens=$MAX_NUM_TOKENS \
+    --tp_size=$TP --ep_size=$EP_SIZE \
+    --extra_llm_api_options=$EXTRA_CONFIG_FILE \
+    > $SERVER_LOG 2>&1 &
+
+SERVER_PID=$!
+
+# Source benchmark utilities
+source "$(dirname "$0")/benchmark_lib.sh"
+
+# Wait for server to be ready
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend openai \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts $(( $CONC * 10 )) \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir /workspace/
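The three scripts differ mainly in the heredoc-generated extra-options file. For reference, this is the dsr1-fp8-mtp.yml the H200 script above would emit in the DP-attention case (MOE_BACKEND=CUTLASS, MTP=1, with the attention_dp_config block appended), expanded by hand from the two heredocs; the preview path is arbitrary and the real file is generated at runtime:

```bash
cat << 'EOF' > /tmp/dsr1-fp8-mtp-preview.yml
cuda_graph_config:
  enable_padding: true
  max_batch_size: 128
enable_attention_dp: true
print_iter_log: true
kv_cache_config:
  dtype: fp8
  free_gpu_memory_fraction: 0.75
  enable_block_reuse: false
stream_interval: 10
moe_config:
  backend: CUTLASS
speculative_config:
  decoding_type: MTP
  num_nextn_predict_layers: 1
attention_dp_config:
  batching_wait_iters: 0
  enable_balance: true
  timeout_iters: 60
EOF
```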
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 21f2e8655..0a87e23c4 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -162,3 +162,12 @@
   description:
     - "Updating MI355x Deepseek-R1 FP4 SGLang Image to upstream v0.5.7"
   pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/395
+
+- config-keys:
+    - dsr1-fp4-b200-trt-mtp
+    - dsr1-fp8-b200-trt-mtp
+    - dsr1-fp8-h200-trt-mtp
+  description:
+    - Add MTP (Multi-Token Prediction) support for single-node TRT configs
+    - Add spec-decoding field to config entries and update launch scripts to select MTP benchmark scripts
+  pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/412
diff --git a/runners/launch_b200-nb.sh b/runners/launch_b200-nb.sh
index 1cb5c3dd1..08ed1e455 100644
--- a/runners/launch_b200-nb.sh
+++ b/runners/launch_b200-nb.sh
@@ -3,6 +3,7 @@ HF_HUB_CACHE_MOUNT="/mnt/data/hf-hub-cache-${USER: -1}/"
 PARTITION="main"
 
 FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
+SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
 
 UCX_NET_DEVICES=eth0
 
@@ -19,4 +20,4 @@ srun --partition=$PARTITION --gres=gpu:$TP --exclusive \
     --container-writable \
     --container-workdir=/workspace/ \
     --no-container-entrypoint --export=ALL,PORT_OFFSET=${USER: -1},UCX_NET_DEVICES=$UCX_NET_DEVICES \
-bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_slurm.sh
\ No newline at end of file
+bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}_slurm.sh
\ No newline at end of file
diff --git a/runners/launch_b200-nv.sh b/runners/launch_b200-nv.sh
index 28286e2be..1305c6848 100644
--- a/runners/launch_b200-nv.sh
+++ b/runners/launch_b200-nv.sh
@@ -5,6 +5,7 @@ export PORT_OFFSET=0 # Doesn't matter when --exclusive
 
 MODEL_CODE="${EXP_NAME%%_*}"
 FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
+SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
 
 PARTITION="dgx-b200"
 SQUASH_FILE="/raid/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
@@ -22,6 +23,6 @@ srun --jobid=$JOB_ID \
     --container-writable \
     --container-workdir=/workspace/ \
     --no-container-entrypoint --export=ALL \
-bash benchmarks/${MODEL_CODE}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_slurm.sh
+bash benchmarks/${MODEL_CODE}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}_slurm.sh
 
 scancel $JOB_ID
diff --git a/runners/launch_h200-cw.sh b/runners/launch_h200-cw.sh
index 0b6740d7b..f72b8bb0d 100644
--- a/runners/launch_h200-cw.sh
+++ b/runners/launch_h200-cw.sh
@@ -5,6 +5,7 @@ export PORT_OFFSET=${USER: -1}
 
 MODEL_CODE="${EXP_NAME%%_*}"
 FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
+SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
 
 PARTITION="h200"
 SQUASH_FILE="/mnt/vast/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
@@ -31,7 +32,7 @@ srun --jobid=$JOB_ID \
     --container-mount-home \
     --container-workdir=/workspace/ \
     --no-container-entrypoint --export=ALL \
-bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}_slurm.sh
+bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}_slurm.sh
 
 rmdir $SAGEMAKER_SHM_PATH
 scancel $JOB_ID
diff --git a/runners/launch_h200-nb.sh b/runners/launch_h200-nb.sh
index 15b6fa6c5..703bcf231 100644
--- a/runners/launch_h200-nb.sh
+++ b/runners/launch_h200-nb.sh
@@ -5,6 +5,7 @@ export PORT_OFFSET=${USER: -1}
 
 MODEL_CODE="${EXP_NAME%%_*}"
 FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
+SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
 
 PARTITION="main"
 SQUASH_FILE="/home/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
@@ -29,6 +30,6 @@ srun --jobid=$JOB_ID \
     --container-mount-home \
     --container-workdir=/workspace/ \
     --no-container-entrypoint --export=ALL \
-bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}_slurm.sh
+bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}_slurm.sh
 
 scancel $JOB_ID
diff --git a/runners/launch_h200-nv.sh b/runners/launch_h200-nv.sh
index 3282be1a8..0434f880b 100644
--- a/runners/launch_h200-nv.sh
+++ b/runners/launch_h200-nv.sh
@@ -5,6 +5,7 @@ export PORT_OFFSET=0 # Doesn't matter when --exclusive
 
 MODEL_CODE="${EXP_NAME%%_*}"
 FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
+SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
 
 PARTITION="dgx-h200"
 SQUASH_FILE="/raid/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
@@ -22,6 +23,6 @@ srun --jobid=$JOB_ID \
     --container-mount-home \
     --container-workdir=/workspace/ \
     --no-container-entrypoint --export=ALL \
-bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}_slurm.sh
+bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}_slurm.sh
 
 scancel $JOB_ID
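All five launchers now compose the benchmark script path from the same pieces of run metadata, so the MTP configs reuse the existing dispatch with one extra suffix. Tracing the substitutions for one of the new configs (the EXP_NAME value here is illustrative; the launchers only use the part before the first underscore via ${EXP_NAME%%_*}):

```bash
EXP_NAME="dsr1_fp8_1024_1024"   # hypothetical experiment name; only the "dsr1" prefix matters
PRECISION="fp8"
FRAMEWORK="trt"
SPEC_DECODING="mtp"

MODEL_CODE="${EXP_NAME%%_*}"                                                   # dsr1
FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')

echo "benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}_slurm.sh"
# benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh  <- the new script added in this PR
```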