From 6ce87790fbac9e44ec41ab990f2b8cffe816e063 Mon Sep 17 00:00:00 2001 From: shicli Date: Wed, 7 Jan 2026 10:42:37 +0800 Subject: [PATCH 01/19] Add refactored MTP benchmarks for dsr1 TRT - Add dsr1_fp4_b200_trt_mtp_slurm.sh with MTP support - Add dsr1_fp8_b200_trt_mtp_slurm.sh with MTP support - Add dsr1_fp8_h200_trt_mtp_slurm.sh with MTP support - Refactored to use benchmark_lib.sh utilities - Use wait_for_server_ready and run_benchmark_serving functions --- benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh | 154 ++++++++++++++++++++++ benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh | 115 ++++++++++++++++ benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh | 110 ++++++++++++++++ 3 files changed, 379 insertions(+) create mode 100644 benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh create mode 100644 benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh create mode 100644 benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh diff --git a/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh new file mode 100644 index 000000000..8200e27a7 --- /dev/null +++ b/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh @@ -0,0 +1,154 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# MODEL +# TP +# CONC +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" + +hf download $MODEL + +# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC, TP ========= +EP_SIZE="1" +MOE_BACKEND="TRTLLM" +DP_ATTENTION=false +MTP=3 + +if [[ "$TP" == "4" ]]; then + if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then + if [[ $CONC -ge 16 ]]; then + EP_SIZE="$TP" + fi + if [[ $CONC -ge 128 ]]; then + DP_ATTENTION=true + MOE_BACKEND="CUTLASS" + MTP=1 + fi + elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then + if [[ $CONC -ge 32 ]]; then + EP_SIZE="$TP" + fi + if [[ $CONC -ge 128 ]]; then + DP_ATTENTION=true + MOE_BACKEND="CUTLASS" + MTP=1 + fi + elif [[ 
"$ISL" == "8192" && "$OSL" == "1024" ]]; then + if [[ $CONC -ge 32 ]]; then + EP_SIZE="$TP" + DP_ATTENTION=true + MOE_BACKEND="CUTLASS" + MTP=1 + fi + fi +elif [[ "$TP" == "8" ]]; then + if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then + if [[ $CONC -ge 16 ]]; then + EP_SIZE="$TP" + fi + if [[ $CONC -ge 64 ]]; then + DP_ATTENTION=true + MOE_BACKEND="CUTLASS" + MTP=1 + fi + elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then + if [[ $CONC -ge 8 ]]; then + EP_SIZE="$TP" + fi + if [[ $CONC -ge 128 ]]; then + DP_ATTENTION=true + MOE_BACKEND="CUTLASS" + MTP=1 + fi + elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then + if [[ $CONC -ge 32 ]]; then + EP_SIZE="$TP" + DP_ATTENTION=true + MOE_BACKEND="CUTLASS" + MTP=1 + fi + fi +fi + +echo "EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION', MTP='$MTP'" + +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +PORT=$(( 8888 + $PORT_OFFSET )) +EXTRA_CONFIG_FILE="dsr1-fp4-mtp.yml" + +cat > $EXTRA_CONFIG_FILE << EOF +cuda_graph_config: + enable_padding: true + max_batch_size: 512 +enable_attention_dp: $DP_ATTENTION +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: $MOE_BACKEND +speculative_config: + decoding_type: MTP + num_nextn_predict_layers: ${MTP} +EOF + +if [[ "$DP_ATTENTION" == "true" ]]; then + cat << EOF >> $EXTRA_CONFIG_FILE +attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 +EOF +fi + +if [[ "$DP_ATTENTION" == "true" ]]; then + MAX_BATCH_SIZE=$((CONC/TP)) +else + MAX_BATCH_SIZE=$CONC +fi + +MAX_NUM_TOKENS=$(( ((MTP+1)*MAX_BATCH_SIZE+ISL+64+63)/64*64 )) + +set -x +# Launch TRT-LLM server +mpirun -n 1 --oversubscribe --allow-run-as-root \ + trtllm-serve $MODEL --port=$PORT \ + --trust_remote_code \ + --backend=pytorch \ + --max_batch_size=$MAX_BATCH_SIZE \ + --max_seq_len=$MAX_MODEL_LEN \ + --max_num_tokens=$MAX_NUM_TOKENS \ + --tp_size=$TP 
--ep_size=$EP_SIZE \ + --extra_llm_api_options=$EXTRA_CONFIG_FILE \ + > $SERVER_LOG 2>&1 & + +SERVER_PID=$! + +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend openai \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh new file mode 100644 index 000000000..8a6795edc --- /dev/null +++ b/benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh @@ -0,0 +1,115 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# MODEL +# TP +# CONC +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" + +hf download $MODEL + +# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= +EP_SIZE="$TP" +MOE_BACKEND="DEEPGEMM" +DP_ATTENTION=false +MTP=3 + +if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then + if [[ $CONC -ge 64 ]]; then + DP_ATTENTION=true + MTP=1 + fi +elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then + if [[ $CONC -ge 128 ]]; then + DP_ATTENTION=true + MTP=1 + fi +elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then + if [[ $CONC -ge 64 ]]; then + DP_ATTENTION=true + MTP=1 + fi +fi + +echo "EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION', MTP='$MTP'" + +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +PORT=$(( 8888 + $PORT_OFFSET )) +EXTRA_CONFIG_FILE="dsr1-fp8-mtp.yml" + +cat > $EXTRA_CONFIG_FILE << EOF +cuda_graph_config: + enable_padding: true + max_batch_size: 256 
+enable_attention_dp: $DP_ATTENTION +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: $MOE_BACKEND +speculative_config: + decoding_type: MTP + num_nextn_predict_layers: ${MTP} +EOF + +if [[ "$DP_ATTENTION" == "true" ]]; then + cat << EOF >> $EXTRA_CONFIG_FILE +attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 +EOF +fi + +if [[ "$DP_ATTENTION" == "true" ]]; then + MAX_BATCH_SIZE=$((CONC/TP)) +else + MAX_BATCH_SIZE=$CONC +fi + +MAX_NUM_TOKENS=$(( ((MTP+1)*MAX_BATCH_SIZE+ISL+64+63)/64*64 )) + +set -x +# Launch TRT-LLM server +mpirun -n 1 --oversubscribe --allow-run-as-root \ + trtllm-serve $MODEL --port=$PORT \ + --trust_remote_code \ + --backend=pytorch \ + --max_batch_size=$MAX_BATCH_SIZE \ + --max_seq_len=$MAX_MODEL_LEN \ + --max_num_tokens=$MAX_NUM_TOKENS \ + --tp_size=$TP --ep_size=$EP_SIZE \ + --extra_llm_api_options=$EXTRA_CONFIG_FILE \ + > $SERVER_LOG 2>&1 & + +SERVER_PID=$! 
+ +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend openai \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh new file mode 100644 index 000000000..065ab7603 --- /dev/null +++ b/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh @@ -0,0 +1,110 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# MODEL +# TP +# CONC +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" + +hf download $MODEL + +# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= +EP_SIZE="$TP" +MOE_BACKEND="CUTLASS" +DP_ATTENTION=false +MTP=3 + +if [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then + if [[ $CONC -ge 256 ]]; then + DP_ATTENTION=true + MTP=1 + fi +elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then + if [[ $CONC -ge 64 ]]; then + DP_ATTENTION=true + MTP=1 + fi +fi + +echo "EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION', MTP='$MTP'" + +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +PORT=$(( 8888 + $PORT_OFFSET )) +EXTRA_CONFIG_FILE="dsr1-fp8-mtp.yml" + +cat > $EXTRA_CONFIG_FILE << EOF +cuda_graph_config: + enable_padding: true + max_batch_size: 128 +enable_attention_dp: $DP_ATTENTION +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.75 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: $MOE_BACKEND +speculative_config: + 
decoding_type: MTP + num_nextn_predict_layers: ${MTP} +EOF + +if [[ "$DP_ATTENTION" == "true" ]]; then + cat << EOF >> $EXTRA_CONFIG_FILE +attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 +EOF +fi + +if [[ "$DP_ATTENTION" == "true" ]]; then + MAX_BATCH_SIZE=$((CONC/TP)) +else + MAX_BATCH_SIZE=$CONC +fi + +MAX_NUM_TOKENS=$(( ((MTP+1)*MAX_BATCH_SIZE+ISL+64+63)/64*64 )) + +set -x +# Launch TRT-LLM server +PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \ + trtllm-serve $MODEL --port=$PORT \ + --trust_remote_code \ + --backend=pytorch \ + --max_batch_size=$MAX_BATCH_SIZE \ + --max_seq_len=$MAX_MODEL_LEN \ + --max_num_tokens=$MAX_NUM_TOKENS \ + --tp_size=$TP --ep_size=$EP_SIZE \ + --extra_llm_api_options=$EXTRA_CONFIG_FILE \ + > $SERVER_LOG 2>&1 & + +SERVER_PID=$! + +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend openai \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ From 92eca1a79e34e11afc34e01bd898ba69d3e5e3a0 Mon Sep 17 00:00:00 2001 From: shicli Date: Wed, 7 Jan 2026 10:50:07 +0800 Subject: [PATCH 02/19] Add --use-chat-template support for MTP benchmarks - Extended benchmark_lib.sh run_benchmark_serving() to support optional --use-chat-template flag - Added --use-chat-template to all three MTP benchmark scripts - This is required for MTP mode to work correctly --- benchmarks/benchmark_lib.sh | 50 +++++++++++++++-------- benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh | 3 +- benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh | 3 +- benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh | 3 +- 4 files changed, 39 insertions(+), 20 
deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index cc7b81553..fad1c5064 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -92,7 +92,7 @@ wait_for_server_ready() { } # Run benchmark serving with standardized parameters -# All parameters are required +# All parameters are required except --use-chat-template # Parameters: # --model: Model name # --port: Server port @@ -104,6 +104,7 @@ wait_for_server_ready() { # --max-concurrency: Max concurrency # --result-filename: Result filename without extension # --result-dir: Result directory +# --use-chat-template: Optional flag to enable chat template run_benchmark_serving() { set +x local model="" @@ -116,6 +117,7 @@ run_benchmark_serving() { local max_concurrency="" local result_filename="" local result_dir="" + local use_chat_template=false # Parse arguments while [[ $# -gt 0 ]]; do @@ -160,6 +162,10 @@ run_benchmark_serving() { result_dir="$2" shift 2 ;; + --use-chat-template) + use_chat_template=true + shift + ;; *) echo "Unknown parameter: $1" return 1 @@ -224,23 +230,33 @@ run_benchmark_serving() { local BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) git clone https://github.com/kimbochen/bench_serving.git "$BENCH_SERVING_DIR" + # Build benchmark command + local benchmark_cmd=( + python3 "$BENCH_SERVING_DIR/benchmark_serving.py" + --model "$model" + --backend "$backend" + --base-url "http://0.0.0.0:$port" + --dataset-name random + --random-input-len "$input_len" + --random-output-len "$output_len" + --random-range-ratio "$random_range_ratio" + --num-prompts "$num_prompts" + --max-concurrency "$max_concurrency" + --request-rate inf + --ignore-eos + --save-result + --percentile-metrics 'ttft,tpot,itl,e2el' + --result-dir "$result_dir" + --result-filename "$result_filename.json" + ) + + # Add --use-chat-template if requested + if [[ "$use_chat_template" == true ]]; then + benchmark_cmd+=(--use-chat-template) + fi + # Run benchmark set -x - python3 
"$BENCH_SERVING_DIR/benchmark_serving.py" \ - --model "$model" \ - --backend "$backend" \ - --base-url "http://0.0.0.0:$port" \ - --dataset-name random \ - --random-input-len "$input_len" \ - --random-output-len "$output_len" \ - --random-range-ratio "$random_range_ratio" \ - --num-prompts "$num_prompts" \ - --max-concurrency "$max_concurrency" \ - --request-rate inf \ - --ignore-eos \ - --save-result \ - --percentile-metrics 'ttft,tpot,itl,e2el' \ - --result-dir "$result_dir" \ - --result-filename "$result_filename.json" + "${benchmark_cmd[@]}" set +x } diff --git a/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh index 8200e27a7..15160202b 100644 --- a/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh @@ -151,4 +151,5 @@ run_benchmark_serving \ --num-prompts $(( $CONC * 10 )) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ + --result-dir /workspace/ \ + --use-chat-template diff --git a/benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh index 8a6795edc..41224b696 100644 --- a/benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh @@ -112,4 +112,5 @@ run_benchmark_serving \ --num-prompts $(( $CONC * 10 )) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ + --result-dir /workspace/ \ + --use-chat-template diff --git a/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh index 065ab7603..99d2e3d20 100644 --- a/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh @@ -107,4 +107,5 @@ run_benchmark_serving \ --num-prompts $(( $CONC * 10 )) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ + --result-dir /workspace/ \ + --use-chat-template From fddf14edf99b544d83680307bd6c9b99f0be512c Mon Sep 17 00:00:00 2001 From: 
shicli Date: Wed, 7 Jan 2026 11:10:45 +0800 Subject: [PATCH 03/19] Add MTP benchmark configurations to nvidia-master.yaml - Add dsr1-fp4-b200-trt-mtp configuration with EP/DP_ATTN/MTP logic - Add dsr1-fp8-b200-trt-mtp configuration with EP/DP_ATTN/MTP logic - Add dsr1-fp8-h200-trt-mtp configuration with EP/DP_ATTN/MTP logic - Configurations align with benchmark script logic for dynamic EP_SIZE, MOE_BACKEND, and MTP values --- .github/configs/nvidia-master.yaml | 108 +++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 34d85fcca..4b72cc498 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -74,6 +74,57 @@ dsr1-fp4-b200-trt: - { tp: 8, conc-start: 4, conc-end: 32 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 } +dsr1-fp4-b200-trt-mtp: + image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 + model: nvidia/DeepSeek-R1-0528-FP4-V2 + model-prefix: dsr1 + runner: b200-trt-mtp + precision: fp4 + framework: trt + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # If TP=4: + # If CONC >= 16, then EP=4 + # If CONC >= 128, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1 + - { tp: 4, conc-start: 4, conc-end: 8 } + - { tp: 4, ep: 4, conc-start: 16, conc-end: 64 } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 256 } + # If TP=8: + # If CONC >= 16, then EP=8 + # If CONC >= 64, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1 + - { tp: 8, conc-start: 4, conc-end: 8 } + - { tp: 8, ep: 8, conc-start: 16, conc-end: 32 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 } + - isl: 1024 + osl: 8192 + search-space: + # If TP=4: + # If CONC >= 32, then EP=4 + # If CONC >= 128, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1 + - { tp: 4, conc-start: 4, conc-end: 16 } + - { tp: 4, ep: 4, conc-start: 32, conc-end: 64 } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 256 } + # 
If TP=8: + # If CONC >= 8, then EP=8 + # If CONC >= 128, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1 + - { tp: 8, conc-start: 4, conc-end: 4 } + - { tp: 8, ep: 8, conc-start: 8, conc-end: 64 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + # If TP=4: + # If CONC >= 32, then EP=4, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1 + - { tp: 4, conc-start: 4, conc-end: 16 } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 32, conc-end: 256 } + # If TP=8: + # If CONC >= 32, then EP=8, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1 + - { tp: 8, conc-start: 4, conc-end: 16 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256 } + dsr1-fp8-b200-sglang: image: lmsysorg/sglang:v0.5.6-cu129-amd64 model: deepseek-ai/DeepSeek-R1-0528 @@ -123,6 +174,35 @@ dsr1-fp8-b200-trt: # If CONC > 64, then DP_ATTN=true - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } +dsr1-fp8-b200-trt-mtp: + image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + runner: b200-trt-mtp + precision: fp8 + framework: trt + multinode: false + seq-len-configs: + # For all sequence lengths, EP=TP, MOE_BACKEND=DEEPGEMM, MTP=3 (or MTP=1 when DP_ATTN=true) + - isl: 1024 + osl: 1024 + search-space: + # If CONC >= 64, then DP_ATTN=true, MTP=1 + - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 } + - isl: 1024 + osl: 8192 + search-space: + # If CONC >= 128, then DP_ATTN=true, MTP=1 + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + # If CONC >= 64, then DP_ATTN=true, MTP=1 + - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 } + dsr1-fp8-h200-sglang: image: lmsysorg/sglang:v0.5.6-cu129-amd64 model: deepseek-ai/DeepSeek-R1-0528 @@ -172,6 +252,34 @@ dsr1-fp8-h200-trt: - 
{ tp: 8, ep: 8, conc-start: 4, conc-end: 32 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64 } +dsr1-fp8-h200-trt-mtp: + image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + runner: h200-trt-mtp + precision: fp8 + framework: trt + multinode: false + # For all sequence lengths, EP=TP, MOE_BACKEND=CUTLASS, MTP=3 (or MTP=1 when DP_ATTN=true) + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # MTP=3 for all, no DP_ATTN in this sequence length + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + search-space: + # If CONC >= 256, then DP_ATTN=true, MTP=1 + - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + # If CONC >= 64, then DP_ATTN=true, MTP=1 + - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 } + gptoss-fp4-b200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc2 model: openai/gpt-oss-120b From 59b38b984859504c819c0c262084343db1713c60 Mon Sep 17 00:00:00 2001 From: shicli Date: Wed, 7 Jan 2026 11:30:27 +0800 Subject: [PATCH 04/19] Refactor MTP benchmarks to receive EP_SIZE and DP_ATTENTION from env vars - Remove duplicate EP_SIZE/DP_ATTENTION calculation logic from MTP scripts - MTP scripts now receive EP_SIZE and DP_ATTENTION as env vars from YAML config (like non-MTP scripts) - Only calculate MOE_BACKEND and MTP values based on DP_ATTENTION flag - Simplifies scripts from 155/116/111 lines to 104 lines each - Eliminates redundant logic between YAML configs and bash scripts --- benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh | 73 ++++------------------- benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh | 32 ++++------ benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh | 27 ++++----- 3 files changed, 31 insertions(+), 101 deletions(-) diff --git a/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh
b/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh index 15160202b..33d819efa 100644 --- a/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh @@ -10,76 +10,25 @@ # RANDOM_RANGE_RATIO # RESULT_FILENAME # PORT_OFFSET +# DP_ATTENTION +# EP_SIZE echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" hf download $MODEL -# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC, TP ========= -EP_SIZE="1" -MOE_BACKEND="TRTLLM" -DP_ATTENTION=false -MTP=3 - -if [[ "$TP" == "4" ]]; then - if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - if [[ $CONC -ge 16 ]]; then - EP_SIZE="$TP" - fi - if [[ $CONC -ge 128 ]]; then - DP_ATTENTION=true - MOE_BACKEND="CUTLASS" - MTP=1 - fi - elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - if [[ $CONC -ge 32 ]]; then - EP_SIZE="$TP" - fi - if [[ $CONC -ge 128 ]]; then - DP_ATTENTION=true - MOE_BACKEND="CUTLASS" - MTP=1 - fi - elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ $CONC -ge 32 ]]; then - EP_SIZE="$TP" - DP_ATTENTION=true - MOE_BACKEND="CUTLASS" - MTP=1 - fi - fi -elif [[ "$TP" == "8" ]]; then - if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - if [[ $CONC -ge 16 ]]; then - EP_SIZE="$TP" - fi - if [[ $CONC -ge 64 ]]; then - DP_ATTENTION=true - MOE_BACKEND="CUTLASS" - MTP=1 - fi - elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - if [[ $CONC -ge 8 ]]; then - EP_SIZE="$TP" - fi - if [[ $CONC -ge 128 ]]; then - DP_ATTENTION=true - MOE_BACKEND="CUTLASS" - MTP=1 - fi - elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ $CONC -ge 32 ]]; then - EP_SIZE="$TP" - DP_ATTENTION=true - MOE_BACKEND="CUTLASS" - MTP=1 - fi - fi +# ========= Determine MOE_BACKEND and MTP based on DP_ATTENTION ========= +if [[ "$DP_ATTENTION" == "true" ]]; then + MOE_BACKEND="CUTLASS" + MTP=1 +else + MOE_BACKEND="TRTLLM" 
+ MTP=3 fi -echo "EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION', MTP='$MTP'" +echo "MOE_BACKEND='$MOE_BACKEND', MTP='$MTP'" SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) diff --git a/benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh index 41224b696..9054bd014 100644 --- a/benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh @@ -10,37 +10,25 @@ # RANDOM_RANGE_RATIO # RESULT_FILENAME # PORT_OFFSET +# DP_ATTENTION +# EP_SIZE echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" hf download $MODEL -# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= -EP_SIZE="$TP" +# ========= Determine MOE_BACKEND and MTP based on DP_ATTENTION ========= MOE_BACKEND="DEEPGEMM" -DP_ATTENTION=false -MTP=3 - -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - if [[ $CONC -ge 64 ]]; then - DP_ATTENTION=true - MTP=1 - fi -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - if [[ $CONC -ge 128 ]]; then - DP_ATTENTION=true - MTP=1 - fi -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ $CONC -ge 64 ]]; then - DP_ATTENTION=true - MTP=1 - fi + +if [[ "$DP_ATTENTION" == "true" ]]; then + MTP=1 +else + MTP=3 fi -echo "EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION', MTP='$MTP'" +echo "MOE_BACKEND='$MOE_BACKEND', MTP='$MTP'" SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) diff --git a/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh index 99d2e3d20..b7063f395 100644 --- a/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh @@ -10,32 +10,25 @@ # RANDOM_RANGE_RATIO # RESULT_FILENAME # PORT_OFFSET +# DP_ATTENTION +# EP_SIZE echo "JOB 
$SLURM_JOB_ID running on $SLURMD_NODENAME" -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" hf download $MODEL -# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= -EP_SIZE="$TP" +# ========= Determine MOE_BACKEND and MTP based on DP_ATTENTION ========= MOE_BACKEND="CUTLASS" -DP_ATTENTION=false -MTP=3 - -if [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - if [[ $CONC -ge 256 ]]; then - DP_ATTENTION=true - MTP=1 - fi -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ $CONC -ge 64 ]]; then - DP_ATTENTION=true - MTP=1 - fi + +if [[ "$DP_ATTENTION" == "true" ]]; then + MTP=1 +else + MTP=3 fi -echo "EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION', MTP='$MTP'" +echo "MOE_BACKEND='$MOE_BACKEND', MTP='$MTP'" SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) From f385b80e4ec7e807c0a9486147d346d22c17c6a8 Mon Sep 17 00:00:00 2001 From: shicli Date: Wed, 7 Jan 2026 11:35:47 +0800 Subject: [PATCH 05/19] Fix MTP benchmark configurations to match original script logic - Fix dsr1-fp4-b200-trt-mtp conc ranges to match EP_SIZE conditions - Fix dsr1-fp8-b200-trt-mtp conc ranges to match DP_ATTENTION conditions - Fix dsr1-fp8-h200-trt-mtp conc ranges to match DP_ATTENTION conditions - All configurations now accurately reflect the original bash script conditional logic --- .github/configs/nvidia-master.yaml | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 4b72cc498..1c38e6827 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -89,14 +89,14 @@ dsr1-fp4-b200-trt-mtp: # If TP=4: # If CONC >= 16, then EP=4 # If CONC >= 128, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1 - - { tp: 4, conc-start: 4, conc-end: 8 } - - { tp: 
4, ep: 4, conc-start: 16, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 16 } + - { tp: 4, ep: 4, conc-start: 16, conc-end: 128 } - { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 256 } # If TP=8: # If CONC >= 16, then EP=8 # If CONC >= 64, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1 - - { tp: 8, conc-start: 4, conc-end: 8 } - - { tp: 8, ep: 8, conc-start: 16, conc-end: 32 } + - { tp: 8, conc-start: 4, conc-end: 16 } + - { tp: 8, ep: 8, conc-start: 16, conc-end: 64 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 } - isl: 1024 osl: 8192 @@ -104,25 +104,25 @@ dsr1-fp4-b200-trt-mtp: # If TP=4: # If CONC >= 32, then EP=4 # If CONC >= 128, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1 - - { tp: 4, conc-start: 4, conc-end: 16 } - - { tp: 4, ep: 4, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 32 } + - { tp: 4, ep: 4, conc-start: 32, conc-end: 128 } - { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 256 } # If TP=8: # If CONC >= 8, then EP=8 # If CONC >= 128, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1 - - { tp: 8, conc-start: 4, conc-end: 4 } - - { tp: 8, ep: 8, conc-start: 8, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 8 } + - { tp: 8, ep: 8, conc-start: 8, conc-end: 128 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 } - isl: 8192 osl: 1024 search-space: # If TP=4: # If CONC >= 32, then EP=4, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1 - - { tp: 4, conc-start: 4, conc-end: 16 } + - { tp: 4, conc-start: 4, conc-end: 32 } - { tp: 4, ep: 4, dp-attn: true, conc-start: 32, conc-end: 256 } # If TP=8: # If CONC >= 32, then EP=8, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1 - - { tp: 8, conc-start: 4, conc-end: 16 } + - { tp: 8, conc-start: 4, conc-end: 32 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256 } dsr1-fp8-b200-sglang: @@ -188,19 +188,19 @@ dsr1-fp8-b200-trt-mtp: osl: 1024 search-space: # If CONC >= 64, then DP_ATTN=true, MTP=1 - - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } + 
- { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 } - isl: 1024 osl: 8192 search-space: # If CONC >= 128, then DP_ATTN=true, MTP=1 - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 } - isl: 8192 osl: 1024 search-space: # If CONC >= 64, then DP_ATTN=true, MTP=1 - - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 } dsr1-fp8-h200-sglang: @@ -271,13 +271,13 @@ dsr1-fp8-h200-trt-mtp: osl: 8192 search-space: # If CONC >= 256, then DP_ATTN=true, MTP=1 - - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 256 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256 } - isl: 8192 osl: 1024 search-space: # If CONC >= 64, then DP_ATTN=true, MTP=1 - - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 } gptoss-fp4-b200-trt: From c5f4550c13f60238b1f72d42a0d93450d966910c Mon Sep 17 00:00:00 2001 From: shicli Date: Wed, 7 Jan 2026 11:40:54 +0800 Subject: [PATCH 06/19] Align MTP conc ranges to powers of 2 - Change conc-end from 64 to 32 for dsr1-fp8-b200-trt-mtp ISL=1024/OSL=1024 - Change conc-end from 128 to 64 for dsr1-fp8-b200-trt-mtp ISL=1024/OSL=8192 - Change conc-end from 64 to 32 for dsr1-fp8-b200-trt-mtp ISL=8192/OSL=1024 - Change conc-end from 256 to 128 for dsr1-fp8-h200-trt-mtp ISL=1024/OSL=8192 - Change conc-end from 64 to 32 for dsr1-fp8-h200-trt-mtp ISL=8192/OSL=1024 - All concurrency ranges now align to powers of 2: 4, 8, 16, 32, 64, 128, 256 --- .github/configs/nvidia-master.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml 
index 1c38e6827..87b6eaf50 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -188,19 +188,19 @@ dsr1-fp8-b200-trt-mtp: osl: 1024 search-space: # If CONC >= 64, then DP_ATTN=true, MTP=1 - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 } - isl: 1024 osl: 8192 search-space: # If CONC >= 128, then DP_ATTN=true, MTP=1 - - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 } - isl: 8192 osl: 1024 search-space: # If CONC >= 64, then DP_ATTN=true, MTP=1 - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 } dsr1-fp8-h200-sglang: @@ -271,13 +271,13 @@ dsr1-fp8-h200-trt-mtp: osl: 8192 search-space: # If CONC >= 256, then DP_ATTN=true, MTP=1 - - { tp: 8, ep: 8, conc-start: 4, conc-end: 256 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256 } - isl: 8192 osl: 1024 search-space: # If CONC >= 64, then DP_ATTN=true, MTP=1 - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 } gptoss-fp4-b200-trt: From dd2a82eaaf243b51b79b5b40b20e5c2ea34a23ea Mon Sep 17 00:00:00 2001 From: shicli Date: Wed, 7 Jan 2026 11:45:13 +0800 Subject: [PATCH 07/19] Fix conc range overlaps in dsr1-fp4-b200-trt-mtp - Remove overlapping boundaries between conc ranges - Change ranges to avoid overlap: 4-8, 16-64, 128-256 (with gaps at 9-15, 65-127) - All ranges now use powers of 2 boundaries without overlap - Applies to all TP/ISL/OSL combinations in dsr1-fp4-b200-trt-mtp --- .github/configs/nvidia-master.yaml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 
10 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 87b6eaf50..4b72cc498 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -89,14 +89,14 @@ dsr1-fp4-b200-trt-mtp: # If TP=4: # If CONC >= 16, then EP=4 # If CONC >= 128, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1 - - { tp: 4, conc-start: 4, conc-end: 16 } - - { tp: 4, ep: 4, conc-start: 16, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 8 } + - { tp: 4, ep: 4, conc-start: 16, conc-end: 64 } - { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 256 } # If TP=8: # If CONC >= 16, then EP=8 # If CONC >= 64, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1 - - { tp: 8, conc-start: 4, conc-end: 16 } - - { tp: 8, ep: 8, conc-start: 16, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 8 } + - { tp: 8, ep: 8, conc-start: 16, conc-end: 32 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 } - isl: 1024 osl: 8192 @@ -104,25 +104,25 @@ dsr1-fp4-b200-trt-mtp: # If TP=4: # If CONC >= 32, then EP=4 # If CONC >= 128, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1 - - { tp: 4, conc-start: 4, conc-end: 32 } - - { tp: 4, ep: 4, conc-start: 32, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 16 } + - { tp: 4, ep: 4, conc-start: 32, conc-end: 64 } - { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 256 } # If TP=8: # If CONC >= 8, then EP=8 # If CONC >= 128, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1 - - { tp: 8, conc-start: 4, conc-end: 8 } - - { tp: 8, ep: 8, conc-start: 8, conc-end: 128 } + - { tp: 8, conc-start: 4, conc-end: 4 } + - { tp: 8, ep: 8, conc-start: 8, conc-end: 64 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 } - isl: 8192 osl: 1024 search-space: # If TP=4: # If CONC >= 32, then EP=4, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1 - - { tp: 4, conc-start: 4, conc-end: 32 } + - { tp: 4, conc-start: 4, conc-end: 16 } - { tp: 4, ep: 4, dp-attn: true, conc-start: 32, conc-end: 256 } # If 
TP=8: # If CONC >= 32, then EP=8, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1 - - { tp: 8, conc-start: 4, conc-end: 32 } + - { tp: 8, conc-start: 4, conc-end: 16 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256 } dsr1-fp8-b200-sglang: From b5c542b2ec2836438ed3017c872244a7c2275618 Mon Sep 17 00:00:00 2001 From: shicli Date: Wed, 7 Jan 2026 15:21:44 +0800 Subject: [PATCH 08/19] larger h200 concurrency --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 4b72cc498..980020253 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -266,7 +266,7 @@ dsr1-fp8-h200-trt-mtp: osl: 1024 search-space: # MTP=3 for all, no DP_ATTN in this sequence length - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 256 } - isl: 1024 osl: 8192 search-space: From cdebd62f62dfba6f65e177af53b7c81ea8ef1168 Mon Sep 17 00:00:00 2001 From: shicli Date: Wed, 7 Jan 2026 15:34:24 +0800 Subject: [PATCH 09/19] fix runner --- .github/configs/nvidia-master.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 980020253..86c3dccea 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -74,11 +74,11 @@ dsr1-fp4-b200-trt: - { tp: 8, conc-start: 4, conc-end: 32 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 } -dsr1-fp4-b200-trt-mtp: +dsr1-fp4-b200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 model: nvidia/DeepSeek-R1-0528-FP4-V2 model-prefix: dsr1 - runner: b200-trt-mtp + runner: b200-trt precision: fp4 framework: trt multinode: false @@ -174,11 +174,11 @@ dsr1-fp8-b200-trt: # If CONC > 64, then DP_ATTN=true - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } -dsr1-fp8-b200-trt-mtp: +dsr1-fp8-b200-trt: image: 
nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 - runner: b200-trt-mtp + runner: b200-trt precision: fp8 framework: trt multinode: false From 835c156af98b3bbb063eff0c24a289c9be90cc65 Mon Sep 17 00:00:00 2001 From: shicli Date: Wed, 7 Jan 2026 15:39:16 +0800 Subject: [PATCH 10/19] fix typo --- .github/configs/nvidia-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 86c3dccea..8a84770a8 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -74,7 +74,7 @@ dsr1-fp4-b200-trt: - { tp: 8, conc-start: 4, conc-end: 32 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 } -dsr1-fp4-b200-trt: +dsr1-fp4-b200-trt-mtp: image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 model: nvidia/DeepSeek-R1-0528-FP4-V2 model-prefix: dsr1 @@ -174,7 +174,7 @@ dsr1-fp8-b200-trt: # If CONC > 64, then DP_ATTN=true - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } -dsr1-fp8-b200-trt: +dsr1-fp8-b200-trt-mtp: image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 From f879ca434b3490ff6405534861b7d119ec9affb5 Mon Sep 17 00:00:00 2001 From: shicli Date: Wed, 7 Jan 2026 15:40:34 +0800 Subject: [PATCH 11/19] fix h200 runner --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 8a84770a8..6fec1b558 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -256,7 +256,7 @@ dsr1-fp8-h200-trt-mtp: image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 - runner: h200-trt-mtp + runner: h200-trt precision: fp8 framework: trt multinode: false From a5f5ebf2b39c9689f28371aecbeae25d3c64f40e Mon Sep 17 00:00:00 2001 From: 
shicli Date: Wed, 7 Jan 2026 15:41:40 +0800 Subject: [PATCH 12/19] fix h200 runner --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 6fec1b558..0d5a357cb 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -256,7 +256,7 @@ dsr1-fp8-h200-trt-mtp: image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 - runner: h200-trt + runner: h200 precision: fp8 framework: trt multinode: false From 92fe8725987b3a19a32c9cb0d586e2c71976c7d3 Mon Sep 17 00:00:00 2001 From: ankursingh-nv Date: Wed, 7 Jan 2026 10:27:58 -0800 Subject: [PATCH 13/19] Add MTP support for single-node TRT configs and launch Scripts --- .github/configs/nvidia-master.yaml | 54 +++++++++++++++--------------- runners/launch_b200-nb.sh | 3 +- runners/launch_b200-nv.sh | 3 +- runners/launch_h200-cw.sh | 3 +- runners/launch_h200-nb.sh | 3 +- runners/launch_h200-nv.sh | 3 +- 6 files changed, 37 insertions(+), 32 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 0d5a357cb..1ae9c90f5 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -89,41 +89,41 @@ dsr1-fp4-b200-trt-mtp: # If TP=4: # If CONC >= 16, then EP=4 # If CONC >= 128, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1 - - { tp: 4, conc-start: 4, conc-end: 8 } - - { tp: 4, ep: 4, conc-start: 16, conc-end: 64 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 256 } + - { tp: 4, conc-start: 4, conc-end: 8, spec-decoding: mtp } + - { tp: 4, ep: 4, conc-start: 16, conc-end: 64, spec-decoding: mtp } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp } # If TP=8: # If CONC >= 16, then EP=8 # If CONC >= 64, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1 - - { tp: 8, conc-start: 4, conc-end: 8 } - - { tp: 8, ep: 8, 
conc-start: 16, conc-end: 32 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 } + - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp } + - { tp: 8, ep: 8, conc-start: 16, conc-end: 32, spec-decoding: mtp } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp } - isl: 1024 osl: 8192 search-space: # If TP=4: # If CONC >= 32, then EP=4 # If CONC >= 128, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1 - - { tp: 4, conc-start: 4, conc-end: 16 } - - { tp: 4, ep: 4, conc-start: 32, conc-end: 64 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 256 } + - { tp: 4, conc-start: 4, conc-end: 16, spec-decoding: mtp } + - { tp: 4, ep: 4, conc-start: 32, conc-end: 64, spec-decoding: mtp } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp } # If TP=8: # If CONC >= 8, then EP=8 # If CONC >= 128, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1 - - { tp: 8, conc-start: 4, conc-end: 4 } - - { tp: 8, ep: 8, conc-start: 8, conc-end: 64 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 } + - { tp: 8, conc-start: 4, conc-end: 4, spec-decoding: mtp } + - { tp: 8, ep: 8, conc-start: 8, conc-end: 64, spec-decoding: mtp } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: # If TP=4: # If CONC >= 32, then EP=4, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1 - - { tp: 4, conc-start: 4, conc-end: 16 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 32, conc-end: 256 } + - { tp: 4, conc-start: 4, conc-end: 16, spec-decoding: mtp } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 32, conc-end: 256, spec-decoding: mtp } # If TP=8: # If CONC >= 32, then EP=8, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1 - - { tp: 8, conc-start: 4, conc-end: 16 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256 } + - { tp: 8, conc-start: 4, conc-end: 16, spec-decoding: mtp } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, 
conc-end: 256, spec-decoding: mtp } dsr1-fp8-b200-sglang: image: lmsysorg/sglang:v0.5.6-cu129-amd64 @@ -188,20 +188,20 @@ dsr1-fp8-b200-trt-mtp: osl: 1024 search-space: # If CONC >= 64, then DP_ATTN=true, MTP=1 - - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp } - isl: 1024 osl: 8192 search-space: # If CONC >= 128, then DP_ATTN=true, MTP=1 - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: # If CONC >= 64, then DP_ATTN=true, MTP=1 - - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp } dsr1-fp8-h200-sglang: image: lmsysorg/sglang:v0.5.6-cu129-amd64 @@ -266,19 +266,19 @@ dsr1-fp8-h200-trt-mtp: osl: 1024 search-space: # MTP=3 for all, no DP_ATTN in this sequence length - - { tp: 8, ep: 8, conc-start: 4, conc-end: 256 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } - isl: 1024 osl: 8192 search-space: # If CONC >= 256, then DP_ATTN=true, MTP=1 - - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 128, spec-decoding: mtp } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: # If CONC >= 64, then DP_ATTN=true, MTP=1 - - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } - - { tp: 8, 
ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp } gptoss-fp4-b200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc2 diff --git a/runners/launch_b200-nb.sh b/runners/launch_b200-nb.sh index 1cb5c3dd1..08ed1e455 100644 --- a/runners/launch_b200-nb.sh +++ b/runners/launch_b200-nb.sh @@ -3,6 +3,7 @@ HF_HUB_CACHE_MOUNT="/mnt/data/hf-hub-cache-${USER: -1}/" PARTITION="main" FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') +SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') UCX_NET_DEVICES=eth0 @@ -19,4 +20,4 @@ srun --partition=$PARTITION --gres=gpu:$TP --exclusive \ --container-writable \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT_OFFSET=${USER: -1},UCX_NET_DEVICES=$UCX_NET_DEVICES \ -bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_slurm.sh \ No newline at end of file +bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}_slurm.sh \ No newline at end of file diff --git a/runners/launch_b200-nv.sh b/runners/launch_b200-nv.sh index 28286e2be..1305c6848 100644 --- a/runners/launch_b200-nv.sh +++ b/runners/launch_b200-nv.sh @@ -5,6 +5,7 @@ export PORT_OFFSET=0 # Doesn't matter when --exclusive MODEL_CODE="${EXP_NAME%%_*}" FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') +SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') PARTITION="dgx-b200" SQUASH_FILE="/raid/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" @@ -22,6 +23,6 @@ srun --jobid=$JOB_ID \ --container-writable \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ -bash benchmarks/${MODEL_CODE}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_slurm.sh +bash benchmarks/${MODEL_CODE}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}_slurm.sh 
scancel $JOB_ID diff --git a/runners/launch_h200-cw.sh b/runners/launch_h200-cw.sh index 0b6740d7b..f72b8bb0d 100644 --- a/runners/launch_h200-cw.sh +++ b/runners/launch_h200-cw.sh @@ -5,6 +5,7 @@ export PORT_OFFSET=${USER: -1} MODEL_CODE="${EXP_NAME%%_*}" FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') +SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') PARTITION="h200" SQUASH_FILE="/mnt/vast/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" @@ -31,7 +32,7 @@ srun --jobid=$JOB_ID \ --container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ -bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}_slurm.sh +bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}_slurm.sh rmdir $SAGEMAKER_SHM_PATH scancel $JOB_ID diff --git a/runners/launch_h200-nb.sh b/runners/launch_h200-nb.sh index 15b6fa6c5..703bcf231 100644 --- a/runners/launch_h200-nb.sh +++ b/runners/launch_h200-nb.sh @@ -5,6 +5,7 @@ export PORT_OFFSET=${USER: -1} MODEL_CODE="${EXP_NAME%%_*}" FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') +SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') PARTITION="main" SQUASH_FILE="/home/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" @@ -29,6 +30,6 @@ srun --jobid=$JOB_ID \ --container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ -bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}_slurm.sh +bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}_slurm.sh scancel $JOB_ID diff --git a/runners/launch_h200-nv.sh b/runners/launch_h200-nv.sh index 3282be1a8..0434f880b 100644 --- a/runners/launch_h200-nv.sh +++ b/runners/launch_h200-nv.sh @@ -5,6 +5,7 @@ export PORT_OFFSET=0 # Doesn't matter when --exclusive MODEL_CODE="${EXP_NAME%%_*}" FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || 
printf '') +SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') PARTITION="dgx-h200" SQUASH_FILE="/raid/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" @@ -22,6 +23,6 @@ srun --jobid=$JOB_ID \ --container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ -bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}_slurm.sh +bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}_slurm.sh scancel $JOB_ID From 48f17a774cb5ceb5df7d043ca18da43a27725880 Mon Sep 17 00:00:00 2001 From: ankursingh-nv Date: Wed, 7 Jan 2026 10:35:49 -0800 Subject: [PATCH 14/19] Add MTP configs to perf-changelog --- perf-changelog.yaml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index c7f68885c..9d361003f 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -141,3 +141,20 @@ description: - Use upstream SGLang images on mi300, mi325 and mi355 for dsr1fp8 pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/332 + +- config-keys: + - gptoss-fp4-gb200-dynamo-trt + - gptoss-fp4-b200-trt + description: + - Explicitly add EP=TP for DP attention configs for B200 AGG nvidia-master file. Multinode Refactor inadvertently changed default EP=1 + - Add GPTOSS DISAGG configurations for GB200 1k1k and 8k1k. 
+ pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/387 + +- config-keys: + - dsr1-fp4-b200-trt-mtp + - dsr1-fp8-b200-trt-mtp + - dsr1-fp8-h200-trt-mtp + description: + - Add MTP (Multi-Token Prediction) support for single-node TRT configs + - Add spec-decoding field to config entries and update launch scripts to select MTP benchmark scripts + pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/392 From 71f5d4af70c89ebec7ba210006621743897eb13e Mon Sep 17 00:00:00 2001 From: ankursingh-nv Date: Wed, 7 Jan 2026 14:27:10 -0800 Subject: [PATCH 15/19] fix perf-changelog --- perf-changelog.yaml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index ceaece321..38e0b242e 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -150,13 +150,6 @@ - Add GPTOSS DISAGG configurations for GB200 1k1k and 8k1k. pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/387 -- config-keys: - - dsr1-fp8-mi355x-sglang-disagg - description: - - "Add PD disaggregation (1P2D) for Mi355X" - - "Includes with and without speculative decoding" - pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/348 - - config-keys: - dsr1-fp4-b200-trt-mtp - dsr1-fp8-b200-trt-mtp From ba1a206e646a578708d51d8c641b0c098ec9501a Mon Sep 17 00:00:00 2001 From: shicli Date: Thu, 8 Jan 2026 10:16:46 +0800 Subject: [PATCH 16/19] fix H200 config --- .github/configs/nvidia-master.yaml | 5 +++-- benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh | 5 +++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 57be1d150..58f297834 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -265,8 +265,9 @@ dsr1-fp8-h200-trt-mtp: - isl: 1024 osl: 1024 search-space: - # MTP=3 for all, no DP_ATTN in this sequence length - - { tp: 8, ep: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } + # If CONC >= 128, then DP_ATTN=true, MTP=1 + - { 
tp: 8, ep: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp } - isl: 1024 osl: 8192 search-space: diff --git a/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh index b7063f395..4e869764a 100644 --- a/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh @@ -34,6 +34,11 @@ SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) EXTRA_CONFIG_FILE="dsr1-fp8-mtp.yml" +# If ISL=8192 and DP_ATTENTION=true, export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:8192 +if [[ "$ISL" == "8192" && "$DP_ATTENTION" == "true" ]]; then + export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:8192" +fi + cat > $EXTRA_CONFIG_FILE << EOF cuda_graph_config: enable_padding: true From b1f04df94bcab81c1408e1be17e3509c364147c7 Mon Sep 17 00:00:00 2001 From: ankursingh-nv Date: Thu, 8 Jan 2026 15:06:36 -0800 Subject: [PATCH 17/19] fix perf-changelog fix perf-changelog file --- perf-changelog.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index bf0b26d15..d23dd87d9 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -150,7 +150,7 @@ - Add GPTOSS DISAGG configurations for GB200 1k1k and 8k1k. 
pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/387 -- config-keys: +- config-keys: - dsr1-fp8-mi355x-sglang-disagg description: - "Add PD disaggregation (1P2D) for Mi355X" @@ -164,4 +164,4 @@ description: - Add MTP (Multi-Token Prediction) support for single-node TRT configs - Add spec-decoding field to config entries and update launch scripts to select MTP benchmark scripts - pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/392 \ No newline at end of file + pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/392 From dae544a58cfa8c604fff63da831f3bb5a5d68761 Mon Sep 17 00:00:00 2001 From: ankursingh-nv Date: Mon, 12 Jan 2026 10:29:12 -0800 Subject: [PATCH 18/19] no chat template --- benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh | 3 +-- benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh | 3 +-- benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh | 3 +-- perf-changelog.yaml | 9 +++------ 4 files changed, 6 insertions(+), 12 deletions(-) diff --git a/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh index 33d819efa..52ca1e9e1 100644 --- a/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh @@ -100,5 +100,4 @@ run_benchmark_serving \ --num-prompts $(( $CONC * 10 )) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ - --use-chat-template + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh index 9054bd014..519e49089 100644 --- a/benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh @@ -100,5 +100,4 @@ run_benchmark_serving \ --num-prompts $(( $CONC * 10 )) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ - --use-chat-template + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh index 4e869764a..ee252098b 100644 --- 
a/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh @@ -105,5 +105,4 @@ run_benchmark_serving \ --num-prompts $(( $CONC * 10 )) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ - --use-chat-template + --result-dir /workspace/ diff --git a/perf-changelog.yaml b/perf-changelog.yaml index d23dd87d9..21f2e8655 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -158,10 +158,7 @@ pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/348 - config-keys: - - dsr1-fp4-b200-trt-mtp - - dsr1-fp8-b200-trt-mtp - - dsr1-fp8-h200-trt-mtp + - dsr1-fp4-mi355x-sglang description: - - Add MTP (Multi-Token Prediction) support for single-node TRT configs - - Add spec-decoding field to config entries and update launch scripts to select MTP benchmark scripts - pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/392 + - "Updating MI355x Deepseek-R1 FP4 SGLang Image to upstream v0.5.7" + pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/395 From 055084fc4dee0bcdf7d08d6c5427d46106907ad3 Mon Sep 17 00:00:00 2001 From: ankursingh-nv Date: Mon, 12 Jan 2026 10:34:24 -0800 Subject: [PATCH 19/19] update perf-changelog --- perf-changelog.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 21f2e8655..af44b69ac 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -162,3 +162,12 @@ description: - "Updating MI355x Deepseek-R1 FP4 SGLang Image to upstream v0.5.7" pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/395 + +- config-keys: + - dsr1-fp4-b200-trt-mtp + - dsr1-fp8-b200-trt-mtp + - dsr1-fp8-h200-trt-mtp + description: + - Add MTP (Multi-Token Prediction) support for single-node TRT configs + - Add spec-decoding field to config entries and update launch scripts to select MTP benchmark scripts + pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/412 \ No newline at end of file