Closed
Changes from all commits · 23 commits
6ce8779
Add refactored MTP benchmarks for dsr1 TRT
lishicheng1996-nv Jan 7, 2026
92eca1a
Add --use-chat-template support for MTP benchmarks
lishicheng1996-nv Jan 7, 2026
fddf14e
Add MTP benchmark configurations to nvidia-master.yaml
lishicheng1996-nv Jan 7, 2026
59b38b9
Refactor MTP benchmarks to receive EP_SIZE and DP_ATTENTION from env …
lishicheng1996-nv Jan 7, 2026
f385b80
Fix MTP benchmark configurations to match original script logic
lishicheng1996-nv Jan 7, 2026
c5f4550
Align MTP conc ranges to powers of 2
lishicheng1996-nv Jan 7, 2026
dd2a82e
Fix conc range overlaps in dsr1-fp4-b200-trt-mtp
lishicheng1996-nv Jan 7, 2026
b5c542b
larger h200 concurrency
lishicheng1996-nv Jan 7, 2026
cdebd62
fix runner
lishicheng1996-nv Jan 7, 2026
835c156
fix typo
lishicheng1996-nv Jan 7, 2026
f879ca4
fix h200 runner
lishicheng1996-nv Jan 7, 2026
a5f5ebf
fix h200 runner
lishicheng1996-nv Jan 7, 2026
92fe872
Add MTP support for single-node TRT configs and launch Scripts
Ankur-singh Jan 7, 2026
48f17a7
Add MTP configs to perf-changelog
Ankur-singh Jan 7, 2026
4b9fc2b
Merge branch 'main' into kepotdar-shicli-dsr1-trt-mtp-refactor
Ankur-singh Jan 7, 2026
27834e0
Merge branch 'main' into kepotdar-shicli-dsr1-trt-mtp-refactor
Ankur-singh Jan 7, 2026
71f5d4a
fix perf-changelog
Ankur-singh Jan 7, 2026
ba1a206
fix H200 config
lishicheng1996-nv Jan 8, 2026
d937147
Merge branch 'main' into kepotdar-shicli-dsr1-trt-mtp-refactor
Ankur-singh Jan 8, 2026
b1f04df
fix per-changelog
Ankur-singh Jan 8, 2026
dae544a
no chat template
Ankur-singh Jan 12, 2026
055084f
update perf-changelog
Ankur-singh Jan 12, 2026
5f34e19
Merge branch 'main' into dsr1-trt-mtp-no-chat-template
Ankur-singh Jan 12, 2026
109 changes: 109 additions & 0 deletions .github/configs/nvidia-master.yaml
@@ -74,6 +74,57 @@ dsr1-fp4-b200-trt:
- { tp: 8, conc-start: 4, conc-end: 32 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 }

dsr1-fp4-b200-trt-mtp:
image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2
model: nvidia/DeepSeek-R1-0528-FP4-V2
model-prefix: dsr1
runner: b200-trt
precision: fp4
framework: trt
multinode: false
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
# If TP=4:
# If CONC >= 16, then EP=4
# If CONC >= 128, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
- { tp: 4, conc-start: 4, conc-end: 8, spec-decoding: mtp }
- { tp: 4, ep: 4, conc-start: 16, conc-end: 64, spec-decoding: mtp }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
# If TP=8:
# If CONC >= 16, then EP=8
# If CONC >= 64, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
- { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp }
- { tp: 8, ep: 8, conc-start: 16, conc-end: 32, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp }
- isl: 1024
osl: 8192
search-space:
# If TP=4:
# If CONC >= 32, then EP=4
# If CONC >= 128, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
- { tp: 4, conc-start: 4, conc-end: 16, spec-decoding: mtp }
- { tp: 4, ep: 4, conc-start: 32, conc-end: 64, spec-decoding: mtp }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
# If TP=8:
# If CONC >= 8, then EP=8
# If CONC >= 128, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
- { tp: 8, conc-start: 4, conc-end: 4, spec-decoding: mtp }
- { tp: 8, ep: 8, conc-start: 8, conc-end: 64, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
- isl: 8192
osl: 1024
search-space:
# If TP=4:
# If CONC >= 32, then EP=4, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
- { tp: 4, conc-start: 4, conc-end: 16, spec-decoding: mtp }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 32, conc-end: 256, spec-decoding: mtp }
# If TP=8:
# If CONC >= 32, then EP=8, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
- { tp: 8, conc-start: 4, conc-end: 16, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256, spec-decoding: mtp }

dsr1-fp8-b200-sglang:
image: lmsysorg/sglang:v0.5.6-cu129-amd64
model: deepseek-ai/DeepSeek-R1-0528
@@ -123,6 +174,35 @@ dsr1-fp8-b200-trt:
# If CONC > 64, then DP_ATTN=true
- { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }

dsr1-fp8-b200-trt-mtp:
image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2
model: deepseek-ai/DeepSeek-R1-0528
model-prefix: dsr1
runner: b200-trt
precision: fp8
framework: trt
multinode: false
seq-len-configs:
# For all sequence lengths, EP=TP, MOE_BACKEND=DEEPGEMM, MTP=3 (or MTP=1 when DP_ATTN=true)
- isl: 1024
osl: 1024
search-space:
# If CONC >= 64, then DP_ATTN=true, MTP=1
- { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp }
- isl: 1024
osl: 8192
search-space:
# If CONC >= 128, then DP_ATTN=true, MTP=1
- { tp: 8, ep: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
- isl: 8192
osl: 1024
search-space:
# If CONC >= 64, then DP_ATTN=true, MTP=1
- { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp }

dsr1-fp8-h200-sglang:
image: lmsysorg/sglang:v0.5.6-cu129-amd64
model: deepseek-ai/DeepSeek-R1-0528
@@ -172,6 +252,35 @@ dsr1-fp8-h200-trt:
- { tp: 8, ep: 8, conc-start: 4, conc-end: 32 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64 }

dsr1-fp8-h200-trt-mtp:
image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2
model: deepseek-ai/DeepSeek-R1-0528
model-prefix: dsr1
runner: h200
precision: fp8
framework: trt
multinode: false
# For all sequence lengths, EP=TP, MOE_BACKEND=CUTLASS, MTP=3 (or MTP=1 when DP_ATTN=true)
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
# If CONC >= 128, then DP_ATTN=true, MTP=1
- { tp: 8, ep: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
- isl: 1024
osl: 8192
search-space:
# If CONC >= 256, then DP_ATTN=true, MTP=1
- { tp: 8, ep: 8, conc-start: 4, conc-end: 128, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256, spec-decoding: mtp }
- isl: 8192
osl: 1024
search-space:
# If CONC >= 64, then DP_ATTN=true, MTP=1
- { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp }

gptoss-fp4-b200-trt:
image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc2
model: openai/gpt-oss-120b
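Each search-space row above is a template the CI harness expands into individual benchmark runs: concurrency sweeps in powers of 2 from conc-start to conc-end (see commit c5f4550), and the tp/ep/dp-attn fields become the TP, EP_SIZE, and DP_ATTENTION environment variables the launch scripts read (see commit 59b38b9). The harness itself is not part of this diff, so the sketch below is only an illustration of that expansion; the function name and the hard-coded row are assumptions.

    #!/usr/bin/env bash
    # Minimal sketch (not the real CI driver) of expanding one
    # search-space row into per-run env settings for a launch script.
    expand_row() {
      local tp=$1 ep=$2 dp_attn=$3 conc_start=$4 conc_end=$5
      for (( conc = conc_start; conc <= conc_end; conc *= 2 )); do
        # These are the env vars dsr1_fp4_b200_trt_mtp_slurm.sh expects.
        echo "TP=$tp EP_SIZE=$ep DP_ATTENTION=$dp_attn CONC=$conc"
      done
    }

    # { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp }
    expand_row 8 8 true 64 256   # prints rows for CONC=64, 128, 256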
50 changes: 33 additions & 17 deletions benchmarks/benchmark_lib.sh
@@ -92,7 +92,7 @@ wait_for_server_ready() {
}

# Run benchmark serving with standardized parameters
# All parameters are required
# All parameters are required except --use-chat-template
# Parameters:
# --model: Model name
# --port: Server port
@@ -104,6 +104,7 @@ wait_for_server_ready() {
# --max-concurrency: Max concurrency
# --result-filename: Result filename without extension
# --result-dir: Result directory
# --use-chat-template: Optional flag to enable chat template
run_benchmark_serving() {
set +x
local model=""
@@ -116,6 +117,7 @@ run_benchmark_serving() {
local max_concurrency=""
local result_filename=""
local result_dir=""
local use_chat_template=false

# Parse arguments
while [[ $# -gt 0 ]]; do
@@ -160,6 +162,10 @@
result_dir="$2"
shift 2
;;
--use-chat-template)
use_chat_template=true
shift
;;
*)
echo "Unknown parameter: $1"
return 1
@@ -224,23 +230,33 @@
local BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX)
git clone https://github.com/kimbochen/bench_serving.git "$BENCH_SERVING_DIR"

# Build benchmark command
local benchmark_cmd=(
python3 "$BENCH_SERVING_DIR/benchmark_serving.py"
--model "$model"
--backend "$backend"
--base-url "http://0.0.0.0:$port"
--dataset-name random
--random-input-len "$input_len"
--random-output-len "$output_len"
--random-range-ratio "$random_range_ratio"
--num-prompts "$num_prompts"
--max-concurrency "$max_concurrency"
--request-rate inf
--ignore-eos
--save-result
--percentile-metrics 'ttft,tpot,itl,e2el'
--result-dir "$result_dir"
--result-filename "$result_filename.json"
)

# Add --use-chat-template if requested
if [[ "$use_chat_template" == true ]]; then
benchmark_cmd+=(--use-chat-template)
fi

# Run benchmark
set -x
python3 "$BENCH_SERVING_DIR/benchmark_serving.py" \
--model "$model" \
--backend "$backend" \
--base-url "http://0.0.0.0:$port" \
--dataset-name random \
--random-input-len "$input_len" \
--random-output-len "$output_len" \
--random-range-ratio "$random_range_ratio" \
--num-prompts "$num_prompts" \
--max-concurrency "$max_concurrency" \
--request-rate inf \
--ignore-eos \
--save-result \
--percentile-metrics 'ttft,tpot,itl,e2el' \
--result-dir "$result_dir" \
--result-filename "$result_filename.json"
"${benchmark_cmd[@]}"
set +x
}
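For reference, a hypothetical call site for the updated function, with the optional flag simply appended when wanted; every value below is illustrative, not taken from the CI configs.

    source benchmarks/benchmark_lib.sh

    run_benchmark_serving \
      --model deepseek-ai/DeepSeek-R1-0528 \
      --port 8888 \
      --backend openai \
      --input-len 1024 \
      --output-len 1024 \
      --random-range-ratio 0.8 \
      --num-prompts 640 \
      --max-concurrency 64 \
      --result-filename dsr1_fp8_tp8_conc64 \
      --result-dir /workspace/ \
      --use-chat-template   # optional; omit to send raw completions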
103 changes: 103 additions & 0 deletions benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh
@@ -0,0 +1,103 @@
#!/usr/bin/env bash

# === Required Env Vars ===
# MODEL
# TP
# CONC
# ISL
# OSL
# MAX_MODEL_LEN
# RANDOM_RANGE_RATIO
# RESULT_FILENAME
# PORT_OFFSET
# DP_ATTENTION
# EP_SIZE

echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"

echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"

# Pre-fetch the model weights from Hugging Face
hf download "$MODEL"

# ========= Determine MOE_BACKEND and MTP based on DP_ATTENTION =========
if [[ "$DP_ATTENTION" == "true" ]]; then
MOE_BACKEND="CUTLASS"
MTP=1
else
MOE_BACKEND="TRTLLM"
MTP=3
fi

echo "MOE_BACKEND='$MOE_BACKEND', MTP='$MTP'"

SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
PORT=$(( 8888 + $PORT_OFFSET ))
EXTRA_CONFIG_FILE="dsr1-fp4-mtp.yml"

cat > $EXTRA_CONFIG_FILE << EOF
cuda_graph_config:
enable_padding: true
max_batch_size: 512
enable_attention_dp: $DP_ATTENTION
print_iter_log: true
kv_cache_config:
dtype: fp8
free_gpu_memory_fraction: 0.8
enable_block_reuse: false
stream_interval: 10
moe_config:
backend: $MOE_BACKEND
speculative_config:
decoding_type: MTP
num_nextn_predict_layers: ${MTP}
EOF

if [[ "$DP_ATTENTION" == "true" ]]; then
cat << EOF >> $EXTRA_CONFIG_FILE
attention_dp_config:
batching_wait_iters: 0
enable_balance: true
timeout_iters: 60
EOF
fi
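# (For reference: with DP_ATTENTION=true the generated dsr1-fp4-mtp.yml ends up
#  with enable_attention_dp: true, the CUTLASS MoE backend,
#  num_nextn_predict_layers: 1, and the attention_dp_config block above;
#  with DP_ATTENTION=false it uses the TRTLLM MoE backend and MTP=3.)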

# With attention DP enabled, requests are spread across the DP (=TP) ranks, so each rank only needs CONC/TP batch slots
if [[ "$DP_ATTENTION" == "true" ]]; then
MAX_BATCH_SIZE=$((CONC/TP))
else
MAX_BATCH_SIZE=$CONC
fi

# Token budget per iteration: one decode step for the full batch ((MTP+1) tokens per request with MTP drafts) plus room for one ISL-token prefill, with headroom, rounded up to a multiple of 64
MAX_NUM_TOKENS=$(( ((MTP+1)*MAX_BATCH_SIZE+ISL+64+63)/64*64 ))
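# (Worked example, values assumed: DP_ATTENTION=true, TP=8, CONC=256, ISL=1024
#  gives MAX_BATCH_SIZE=256/8=32 and MTP=1, so (1+1)*32+1024+64=1152, already
#  a multiple of 64, hence MAX_NUM_TOKENS=1152.)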

set -x
# Launch TRT-LLM server
mpirun -n 1 --oversubscribe --allow-run-as-root \
trtllm-serve $MODEL --port=$PORT \
--trust_remote_code \
--backend=pytorch \
--max_batch_size=$MAX_BATCH_SIZE \
--max_seq_len=$MAX_MODEL_LEN \
--max_num_tokens=$MAX_NUM_TOKENS \
--tp_size=$TP --ep_size=$EP_SIZE \
--extra_llm_api_options=$EXTRA_CONFIG_FILE \
> $SERVER_LOG 2>&1 &

SERVER_PID=$!

# Source benchmark utilities
source "$(dirname "$0")/benchmark_lib.sh"

# Wait for server to be ready
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

run_benchmark_serving \
--model "$MODEL" \
--port "$PORT" \
--backend openai \
--input-len "$ISL" \
--output-len "$OSL" \
--random-range-ratio "$RANDOM_RANGE_RATIO" \
--num-prompts $(( $CONC * 10 )) \
--max-concurrency "$CONC" \
--result-filename "$RESULT_FILENAME" \
--result-dir /workspace/
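For a local smoke test outside Slurm, the script can be driven by exporting the variables listed in its header; everything below is illustrative (MAX_MODEL_LEN in particular is just a safe value covering ISL+OSL), chosen to match the tp=8/ep=8/dp-attn row of the dsr1-fp4-b200-trt-mtp search space.

    MODEL=nvidia/DeepSeek-R1-0528-FP4-V2 \
    TP=8 EP_SIZE=8 DP_ATTENTION=true \
    CONC=64 ISL=1024 OSL=1024 \
    MAX_MODEL_LEN=4096 \
    RANDOM_RANGE_RATIO=0.8 \
    RESULT_FILENAME=dsr1_fp4_tp8_conc64 \
    PORT_OFFSET=0 \
    bash benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh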