diff --git a/benchmarks/gptoss_fp4_b200_trt_docker.sh b/benchmarks/gptoss_fp4_b200_trt_docker.sh
new file mode 100644
index 000000000..1f5fbe868
--- /dev/null
+++ b/benchmarks/gptoss_fp4_b200_trt_docker.sh
@@ -0,0 +1,97 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# === Required Env Vars ===
+# MODEL
+# PORT
+# TP
+# EP_SIZE
+# DP_ATTENTION
+# CONC
+# ISL
+# OSL
+# MAX_MODEL_LEN
+# RANDOM_RANGE_RATIO
+# NUM_PROMPTS
+# RESULT_FILENAME
+
+# Fail fast with a clear message if any required env var is unset or empty,
+# instead of silently generating a broken config below.
+for var in MODEL PORT TP EP_SIZE DP_ATTENTION CONC ISL OSL \
+           MAX_MODEL_LEN RANDOM_RANGE_RATIO NUM_PROMPTS RESULT_FILENAME; do
+  : "${!var:?Required env var ${var} is not set}"
+done
+
+SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
+
+# GPTOSS TRTLLM Deployment Guide:
+# https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/deployment-guide/quick-start-recipe-for-gpt-oss-on-trtllm.md
+
+MOE_BACKEND="TRTLLM"
+echo "MOE_BACKEND set to '$MOE_BACKEND'"
+
+EXTRA_CONFIG_FILE="gptoss-fp4.yml"
+export TRTLLM_ENABLE_PDL=1
+export NCCL_GRAPH_REGISTER=0
+
+cat > "$EXTRA_CONFIG_FILE" << EOF
+cuda_graph_config:
+  enable_padding: true
+  max_batch_size: $CONC
+enable_attention_dp: $DP_ATTENTION
+kv_cache_config:
+  dtype: fp8
+  enable_block_reuse: false
+  free_gpu_memory_fraction: 0.85
+print_iter_log: true
+stream_interval: 20
+num_postprocess_workers: 4
+moe_config:
+  backend: $MOE_BACKEND
+EOF
+
+if [[ "$DP_ATTENTION" == "true" ]]; then
+  cat << EOF >> "$EXTRA_CONFIG_FILE"
+attention_dp_config:
+  enable_balance: true
+EOF
+fi
+
+echo "Generated config file contents:"
+cat "$EXTRA_CONFIG_FILE"
+
+set -x
+
+MAX_NUM_TOKENS=20000
+
+# Launch TRT-LLM server in the background; all output goes to SERVER_LOG.
+mpirun -n 1 --oversubscribe --allow-run-as-root \
+  trtllm-serve "$MODEL" --port="$PORT" \
+  --trust_remote_code \
+  --backend=pytorch \
+  --max_batch_size 512 \
+  --max_seq_len="$MAX_MODEL_LEN" \
+  --max_num_tokens="$MAX_NUM_TOKENS" \
+  --tp_size="$TP" --ep_size="$EP_SIZE" \
+  --extra_llm_api_options="$EXTRA_CONFIG_FILE" > "$SERVER_LOG" 2>&1 &
+
+SERVER_PID=$!
+
+# Source benchmark utilities
+source "$(dirname "$0")/benchmark_lib.sh"
+
+# Wait for server to be ready
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+pip install -q datasets pandas
+
+run_benchmark_serving \
+  --model "$MODEL" \
+  --port "$PORT" \
+  --backend openai \
+  --input-len "$ISL" \
+  --output-len "$OSL" \
+  --random-range-ratio "$RANDOM_RANGE_RATIO" \
+  --num-prompts "$NUM_PROMPTS" \
+  --max-concurrency "$CONC" \
+  --result-filename "$RESULT_FILENAME" \
+  --result-dir /workspace/
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 926ac7e1d..995c0a684 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -86,3 +86,8 @@
   description: |
   - Updating MI355x Deepseek-R1 FP4 SGLang Image to upstream v0.5.6.post1
   PR: https://github.com/InferenceMAX/InferenceMAX/pull/330
+- config-keys:
+  - gptoss-fp4-b200-trt
+  description: |
+  - Add benchmark script for GPTOSS FP4 B200 TRT-LLM
+  PR: https://github.com/InferenceMAX/InferenceMAX/pull/256