Closed
Changes from all commits · 23 commits
6ce8779
Add refactored MTP benchmarks for dsr1 TRT
lishicheng1996-nv Jan 7, 2026
92eca1a
Add --use-chat-template support for MTP benchmarks
lishicheng1996-nv Jan 7, 2026
fddf14e
Add MTP benchmark configurations to nvidia-master.yaml
lishicheng1996-nv Jan 7, 2026
59b38b9
Refactor MTP benchmarks to receive EP_SIZE and DP_ATTENTION from env …
lishicheng1996-nv Jan 7, 2026
f385b80
Fix MTP benchmark configurations to match original script logic
lishicheng1996-nv Jan 7, 2026
c5f4550
Align MTP conc ranges to powers of 2
lishicheng1996-nv Jan 7, 2026
dd2a82e
Fix conc range overlaps in dsr1-fp4-b200-trt-mtp
lishicheng1996-nv Jan 7, 2026
b5c542b
larger h200 concurrency
lishicheng1996-nv Jan 7, 2026
cdebd62
fix runner
lishicheng1996-nv Jan 7, 2026
835c156
fix typo
lishicheng1996-nv Jan 7, 2026
f879ca4
fix h200 runner
lishicheng1996-nv Jan 7, 2026
a5f5ebf
fix h200 runner
lishicheng1996-nv Jan 7, 2026
92fe872
Add MTP support for single-node TRT configs and launch Scripts
Ankur-singh Jan 7, 2026
48f17a7
Add MTP configs to perf-changelog
Ankur-singh Jan 7, 2026
4b9fc2b
Merge branch 'main' into kepotdar-shicli-dsr1-trt-mtp-refactor
Ankur-singh Jan 7, 2026
27834e0
Merge branch 'main' into kepotdar-shicli-dsr1-trt-mtp-refactor
Ankur-singh Jan 7, 2026
71f5d4a
fix perf-changelog
Ankur-singh Jan 7, 2026
ba1a206
fix H200 config
lishicheng1996-nv Jan 8, 2026
d937147
Merge branch 'main' into kepotdar-shicli-dsr1-trt-mtp-refactor
Ankur-singh Jan 8, 2026
b1f04df
fix per-changelog
Ankur-singh Jan 8, 2026
dae544a
no chat template
Ankur-singh Jan 12, 2026
055084f
update perf-changelog
Ankur-singh Jan 12, 2026
5f34e19
Merge branch 'main' into dsr1-trt-mtp-no-chat-template
Ankur-singh Jan 12, 2026
109 changes: 109 additions & 0 deletions .github/configs/nvidia-master.yaml
@@ -74,6 +74,57 @@ dsr1-fp4-b200-trt:
- { tp: 8, conc-start: 4, conc-end: 32 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 }

dsr1-fp4-b200-trt-mtp:
image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2
model: nvidia/DeepSeek-R1-0528-FP4-V2
model-prefix: dsr1
runner: b200-trt
precision: fp4
framework: trt
multinode: false
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
# If TP=4:
# If CONC >= 16, then EP=4
# If CONC >= 128, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
- { tp: 4, conc-start: 4, conc-end: 8, spec-decoding: mtp }
- { tp: 4, ep: 4, conc-start: 16, conc-end: 64, spec-decoding: mtp }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
# If TP=8:
# If CONC >= 16, then EP=8
# If CONC >= 64, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
- { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp }
- { tp: 8, ep: 8, conc-start: 16, conc-end: 32, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp }
- isl: 1024
osl: 8192
search-space:
# If TP=4:
# If CONC >= 32, then EP=4
# If CONC >= 128, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
- { tp: 4, conc-start: 4, conc-end: 16, spec-decoding: mtp }
- { tp: 4, ep: 4, conc-start: 32, conc-end: 64, spec-decoding: mtp }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
# If TP=8:
# If CONC >= 8, then EP=8
# If CONC >= 128, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
- { tp: 8, conc-start: 4, conc-end: 4, spec-decoding: mtp }
- { tp: 8, ep: 8, conc-start: 8, conc-end: 64, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
- isl: 8192
osl: 1024
search-space:
# If TP=4:
# If CONC >= 32, then EP=4, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
- { tp: 4, conc-start: 4, conc-end: 16, spec-decoding: mtp }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 32, conc-end: 256, spec-decoding: mtp }
# If TP=8:
# If CONC >= 32, then EP=8, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
- { tp: 8, conc-start: 4, conc-end: 16, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256, spec-decoding: mtp }

dsr1-fp8-b200-sglang:
image: lmsysorg/sglang:v0.5.6-cu129-amd64
model: deepseek-ai/DeepSeek-R1-0528
@@ -123,6 +174,35 @@ dsr1-fp8-b200-trt:
# If CONC > 64, then DP_ATTN=true
- { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }

dsr1-fp8-b200-trt-mtp:
image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2
model: deepseek-ai/DeepSeek-R1-0528
model-prefix: dsr1
runner: b200-trt
precision: fp8
framework: trt
multinode: false
seq-len-configs:
# For all sequence lengths, EP=TP, MOE_BACKEND=DEEPGEMM, MTP=3 (or MTP=1 when DP_ATTN=true)
- isl: 1024
osl: 1024
search-space:
# If CONC >= 64, then DP_ATTN=true, MTP=1
- { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp }
- isl: 1024
osl: 8192
search-space:
# If CONC >= 128, then DP_ATTN=true, MTP=1
- { tp: 8, ep: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
- isl: 8192
osl: 1024
search-space:
# If CONC >= 64, then DP_ATTN=true, MTP=1
- { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp }

dsr1-fp8-h200-sglang:
image: lmsysorg/sglang:v0.5.6-cu129-amd64
model: deepseek-ai/DeepSeek-R1-0528
@@ -172,6 +252,35 @@ dsr1-fp8-h200-trt:
- { tp: 8, ep: 8, conc-start: 4, conc-end: 32 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64 }

dsr1-fp8-h200-trt-mtp:
image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2
model: deepseek-ai/DeepSeek-R1-0528
model-prefix: dsr1
runner: h200
precision: fp8
framework: trt
multinode: false
# For all sequence lengths, EP=TP, MOE_BACKEND=CUTLASS, MTP=3 (or MTP=1 when DP_ATTN=true)
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
# If CONC >= 128, then DP_ATTN=true, MTP=1
- { tp: 8, ep: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
- isl: 1024
osl: 8192
search-space:
# If CONC >= 256, then DP_ATTN=true, MTP=1
- { tp: 8, ep: 8, conc-start: 4, conc-end: 128, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256, spec-decoding: mtp }
- isl: 8192
osl: 1024
search-space:
# If CONC >= 64, then DP_ATTN=true, MTP=1
- { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp }

gptoss-fp4-b200-trt:
image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc2
model: openai/gpt-oss-120b
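Each search-space row above is a template the CI harness expands into individual benchmark runs: concurrency sweeps in powers of 2 from conc-start to conc-end (see commit c5f4550), and the tp/ep/dp-attn fields become the TP, EP_SIZE, and DP_ATTENTION environment variables the launch scripts read (see commit 59b38b9). The harness itself is not part of this diff, so the sketch below is only an illustration of that expansion; the function name and the hard-coded row are assumptions.

    #!/usr/bin/env bash
    # Minimal sketch (not the real CI driver) of expanding one
    # search-space row into per-run env settings for a launch script.
    expand_row() {
      local tp=$1 ep=$2 dp_attn=$3 conc_start=$4 conc_end=$5
      for (( conc = conc_start; conc <= conc_end; conc *= 2 )); do
        # These are the env vars dsr1_fp4_b200_trt_mtp_slurm.sh expects.
        echo "TP=$tp EP_SIZE=$ep DP_ATTENTION=$dp_attn CONC=$conc"
      done
    }

    # { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp }
    expand_row 8 8 true 64 256   # prints rows for CONC=64, 128, 256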
50 changes: 33 additions & 17 deletions benchmarks/benchmark_lib.sh
@@ -92,7 +92,7 @@ wait_for_server_ready() {
}

# Run benchmark serving with standardized parameters
# All parameters are required
# All parameters are required except --use-chat-template
# Parameters:
# --model: Model name
# --port: Server port
@@ -104,6 +104,7 @@ wait_for_server_ready() {
# --max-concurrency: Max concurrency
# --result-filename: Result filename without extension
# --result-dir: Result directory
# --use-chat-template: Optional flag to enable chat template
run_benchmark_serving() {
set +x
local model=""
@@ -116,6 +117,7 @@ run_benchmark_serving() {
local max_concurrency=""
local result_filename=""
local result_dir=""
local use_chat_template=false

# Parse arguments
while [[ $# -gt 0 ]]; do
@@ -160,6 +162,10 @@
result_dir="$2"
shift 2
;;
--use-chat-template)
use_chat_template=true
shift
;;
*)
echo "Unknown parameter: $1"
return 1
@@ -224,23 +230,33 @@
local BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX)
git clone https://github.com/kimbochen/bench_serving.git "$BENCH_SERVING_DIR"

# Build benchmark command
local benchmark_cmd=(
python3 "$BENCH_SERVING_DIR/benchmark_serving.py"
--model "$model"
--backend "$backend"
--base-url "http://0.0.0.0:$port"
--dataset-name random
--random-input-len "$input_len"
--random-output-len "$output_len"
--random-range-ratio "$random_range_ratio"
--num-prompts "$num_prompts"
--max-concurrency "$max_concurrency"
--request-rate inf
--ignore-eos
--save-result
--percentile-metrics 'ttft,tpot,itl,e2el'
--result-dir "$result_dir"
--result-filename "$result_filename.json"
)

# Add --use-chat-template if requested
if [[ "$use_chat_template" == true ]]; then
benchmark_cmd+=(--use-chat-template)
fi

# Run benchmark
set -x
python3 "$BENCH_SERVING_DIR/benchmark_serving.py" \
--model "$model" \
--backend "$backend" \
--base-url "http://0.0.0.0:$port" \
--dataset-name random \
--random-input-len "$input_len" \
--random-output-len "$output_len" \
--random-range-ratio "$random_range_ratio" \
--num-prompts "$num_prompts" \
--max-concurrency "$max_concurrency" \
--request-rate inf \
--ignore-eos \
--save-result \
--percentile-metrics 'ttft,tpot,itl,e2el' \
--result-dir "$result_dir" \
--result-filename "$result_filename.json"
"${benchmark_cmd[@]}"
set +x
}
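For reference, a hypothetical call site for the updated function, with the optional flag simply appended when wanted; every value below is illustrative, not taken from the CI configs.

    source benchmarks/benchmark_lib.sh

    run_benchmark_serving \
      --model deepseek-ai/DeepSeek-R1-0528 \
      --port 8888 \
      --backend openai \
      --input-len 1024 \
      --output-len 1024 \
      --random-range-ratio 0.8 \
      --num-prompts 640 \
      --max-concurrency 64 \
      --result-filename dsr1_fp8_tp8_conc64 \
      --result-dir /workspace/ \
      --use-chat-template   # optional; omit to send raw completions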
103 changes: 103 additions & 0 deletions benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh
@@ -0,0 +1,103 @@
#!/usr/bin/env bash

# === Required Env Vars ===
# MODEL
# TP
# CONC
# ISL
# OSL
# MAX_MODEL_LEN
# RANDOM_RANGE_RATIO
# RESULT_FILENAME
# PORT_OFFSET
# DP_ATTENTION
# EP_SIZE

echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"

echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"

# Pre-fetch the model weights from Hugging Face
hf download "$MODEL"

# ========= Determine MOE_BACKEND and MTP based on DP_ATTENTION =========
if [[ "$DP_ATTENTION" == "true" ]]; then
MOE_BACKEND="CUTLASS"
MTP=1
else
MOE_BACKEND="TRTLLM"
MTP=3
fi

echo "MOE_BACKEND='$MOE_BACKEND', MTP='$MTP'"

SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
PORT=$(( 8888 + $PORT_OFFSET ))
EXTRA_CONFIG_FILE="dsr1-fp4-mtp.yml"

cat > $EXTRA_CONFIG_FILE << EOF
cuda_graph_config:
enable_padding: true
max_batch_size: 512
enable_attention_dp: $DP_ATTENTION
print_iter_log: true
kv_cache_config:
dtype: fp8
free_gpu_memory_fraction: 0.8
enable_block_reuse: false
stream_interval: 10
moe_config:
backend: $MOE_BACKEND
speculative_config:
decoding_type: MTP
num_nextn_predict_layers: ${MTP}
EOF

if [[ "$DP_ATTENTION" == "true" ]]; then
cat << EOF >> $EXTRA_CONFIG_FILE
attention_dp_config:
batching_wait_iters: 0
enable_balance: true
timeout_iters: 60
EOF
fi
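# (For reference: with DP_ATTENTION=true the generated dsr1-fp4-mtp.yml ends up
#  with enable_attention_dp: true, the CUTLASS MoE backend,
#  num_nextn_predict_layers: 1, and the attention_dp_config block above;
#  with DP_ATTENTION=false it uses the TRTLLM MoE backend and MTP=3.)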

# With attention DP enabled, requests are spread across the DP (=TP) ranks, so each rank only needs CONC/TP batch slots
if [[ "$DP_ATTENTION" == "true" ]]; then
MAX_BATCH_SIZE=$((CONC/TP))
else
MAX_BATCH_SIZE=$CONC
fi

# Token budget per iteration: one decode step for the full batch ((MTP+1) tokens per request with MTP drafts) plus room for one ISL-token prefill, with headroom, rounded up to a multiple of 64
MAX_NUM_TOKENS=$(( ((MTP+1)*MAX_BATCH_SIZE+ISL+64+63)/64*64 ))
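# (Worked example, values assumed: DP_ATTENTION=true, TP=8, CONC=256, ISL=1024
#  gives MAX_BATCH_SIZE=256/8=32 and MTP=1, so (1+1)*32+1024+64=1152, already
#  a multiple of 64, hence MAX_NUM_TOKENS=1152.)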

set -x
# Launch TRT-LLM server
mpirun -n 1 --oversubscribe --allow-run-as-root \
trtllm-serve $MODEL --port=$PORT \
--trust_remote_code \
--backend=pytorch \
--max_batch_size=$MAX_BATCH_SIZE \
--max_seq_len=$MAX_MODEL_LEN \
--max_num_tokens=$MAX_NUM_TOKENS \
--tp_size=$TP --ep_size=$EP_SIZE \
--extra_llm_api_options=$EXTRA_CONFIG_FILE \
> $SERVER_LOG 2>&1 &

SERVER_PID=$!

# Source benchmark utilities
source "$(dirname "$0")/benchmark_lib.sh"

# Wait for server to be ready
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

run_benchmark_serving \
--model "$MODEL" \
--port "$PORT" \
--backend openai \
--input-len "$ISL" \
--output-len "$OSL" \
--random-range-ratio "$RANDOM_RANGE_RATIO" \
--num-prompts $(( $CONC * 10 )) \
--max-concurrency "$CONC" \
--result-filename "$RESULT_FILENAME" \
--result-dir /workspace/
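For a local smoke test outside Slurm, the script can be driven by exporting the variables listed in its header; everything below is illustrative (MAX_MODEL_LEN in particular is just a safe value covering ISL+OSL), chosen to match the tp=8/ep=8/dp-attn row of the dsr1-fp4-b200-trt-mtp search space.

    MODEL=nvidia/DeepSeek-R1-0528-FP4-V2 \
    TP=8 EP_SIZE=8 DP_ATTENTION=true \
    CONC=64 ISL=1024 OSL=1024 \
    MAX_MODEL_LEN=4096 \
    RANDOM_RANGE_RATIO=0.8 \
    RESULT_FILENAME=dsr1_fp4_tp8_conc64 \
    PORT_OFFSET=0 \
    bash benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh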