Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 90 additions & 0 deletions benchmarks/gptoss_fp4_b200_trt_docker.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#!/usr/bin/env bash
# Launch a TRT-LLM server for GPT-OSS FP4 on B200 and benchmark it.
#
# === Required Env Vars ===
# MODEL              - model id or local path served by trtllm-serve
# PORT               - port the server listens on
# TP                 - tensor-parallel size
# EP_SIZE            - expert-parallel size
# DP_ATTENTION       - "true"/"false": enable attention data parallelism
# CONC               - benchmark max concurrency (also CUDA-graph max batch size)
# ISL                - input sequence length
# OSL                - output sequence length
# MAX_MODEL_LEN      - maximum sequence length for the server
# RANDOM_RANGE_RATIO - random range ratio for generated benchmark prompts
# NUM_PROMPTS        - number of benchmark prompts
# RESULT_FILENAME    - benchmark result file name

set -euo pipefail

# Fail fast with a clear message if any required env var is missing, instead
# of silently generating a config full of empty values.
for required_var in MODEL PORT TP EP_SIZE DP_ATTENTION CONC ISL OSL \
                    MAX_MODEL_LEN RANDOM_RANGE_RATIO NUM_PROMPTS RESULT_FILENAME; do
  : "${!required_var:?Required env var ${required_var} is not set}"
done

SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) || { echo "mktemp failed" >&2; exit 1; }

# GPTOSS TRTLLM Deployment Guide:
# https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/deployment-guide/quick-start-recipe-for-gpt-oss-on-trtllm.md

MOE_BACKEND="TRTLLM"
echo "MOE_BACKEND set to '$MOE_BACKEND'"

EXTRA_CONFIG_FILE="gptoss-fp4.yml"
export TRTLLM_ENABLE_PDL=1
export NCCL_GRAPH_REGISTER=0

# Extra LLM API options consumed by trtllm-serve via --extra_llm_api_options.
cat > "$EXTRA_CONFIG_FILE" << EOF
cuda_graph_config:
  enable_padding: true
  max_batch_size: $CONC
enable_attention_dp: $DP_ATTENTION
kv_cache_config:
  dtype: fp8
  enable_block_reuse: false
  free_gpu_memory_fraction: 0.85
print_iter_log: true
stream_interval: 20
num_postprocess_workers: 4
moe_config:
  backend: $MOE_BACKEND
EOF

# With attention DP enabled, also balance requests across the DP ranks.
if [[ "$DP_ATTENTION" == "true" ]]; then
cat << EOF >> "$EXTRA_CONFIG_FILE"
attention_dp_config:
  enable_balance: true
EOF
fi

echo "Generated config file contents:"
cat "$EXTRA_CONFIG_FILE"

# Trace commands from here on so the exact launch/benchmark command lines
# appear in the CI log.
set -x

# Per-iteration token budget for the server scheduler.
MAX_NUM_TOKENS=20000

# Launch the TRT-LLM server in the background; all server output goes to the
# temp log so wait_for_server_ready can inspect it. All expansions are quoted
# so paths with spaces don't word-split (ShellCheck SC2086).
mpirun -n 1 --oversubscribe --allow-run-as-root \
  trtllm-serve "$MODEL" --port="$PORT" \
  --trust_remote_code \
  --backend=pytorch \
  --max_batch_size 512 \
  --max_seq_len="$MAX_MODEL_LEN" \
  --max_num_tokens="$MAX_NUM_TOKENS" \
  --tp_size="$TP" --ep_size="$EP_SIZE" \
  --extra_llm_api_options="$EXTRA_CONFIG_FILE" > "$SERVER_LOG" 2>&1 &

SERVER_PID=$!

# Source benchmark utilities (provides wait_for_server_ready and
# run_benchmark_serving) from the directory this script lives in.
source "$(dirname "$0")/benchmark_lib.sh"

# Block until the server answers on $PORT; the helper also watches the server
# PID/log so a crashed server fails fast instead of hanging the job.
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

# Dependencies for the benchmark client.
pip install -q datasets pandas

run_benchmark_serving \
  --model "$MODEL" \
  --port "$PORT" \
  --backend openai \
  --input-len "$ISL" \
  --output-len "$OSL" \
  --random-range-ratio "$RANDOM_RANGE_RATIO" \
  --num-prompts "$NUM_PROMPTS" \
  --max-concurrency "$CONC" \
  --result-filename "$RESULT_FILENAME" \
  --result-dir /workspace/
5 changes: 5 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -86,3 +86,8 @@
description: |
- Updating MI355x Deepseek-R1 FP4 SGLang Image to upstream v0.5.6.post1
PR: https://github.com/InferenceMAX/InferenceMAX/pull/330
- config-keys:
- gptoss-fp4-b200-trt
description: |
- Add benchmark script for GPTOSS FP4 B200 TRT-LLM
PR: https://github.com/InferenceMAX/InferenceMAX/pull/256
Loading