diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 954abbba2..a3d848475 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -167,25 +167,28 @@ dsr1-fp8-h200-trt:
     - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64 }
 
 gptoss-fp4-b200-trt:
-  image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1
+  image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc2
   model: openai/gpt-oss-120b
   model-prefix: gptoss
   runner: b200-trt
   precision: fp4
   framework: trt
-  # For all sequence lengths, if CONC >= 256, then EP=TP and DP_ATTN=true
+  # Enable DP_ATTENTION for conc >= 32
   seq-len-configs:
     - isl: 1024
       osl: 1024
       search-space:
+        - { tp: 2, dp-attn: true, conc-start: 32, conc-end: 128 }
+        - { tp: 4, dp-attn: true, conc-start: 32, conc-end: 64 }
         - { tp: 1, conc-start: 64, conc-end: 128 }
-        - { tp: 2, conc-start: 4, conc-end: 128 }
-        - { tp: 4, conc-start: 4, conc-end: 128 }
+        - { tp: 2, conc-start: 4, conc-end: 32 }
+        - { tp: 4, conc-start: 4, conc-end: 64 }
         - { tp: 8, conc-start: 4, conc-end: 8 }
     - isl: 1024
       osl: 8192
       search-space:
         - { tp: 1, conc-start: 64, conc-end: 128 }
+        - { tp: 2, dp-attn: true, conc-start: 64, conc-end: 128 }
         - { tp: 2, conc-start: 4, conc-end: 128 }
         - { tp: 4, conc-start: 4, conc-end: 128 }
         - { tp: 8, conc-start: 4, conc-end: 16 }
@@ -193,8 +196,9 @@ gptoss-fp4-b200-trt:
       osl: 1024
       search-space:
         - { tp: 1, conc-start: 64, conc-end: 128 }
+        - { tp: 2, dp-attn: true, conc-start: 64, conc-end: 128 }
         - { tp: 2, conc-start: 4, conc-end: 128 }
-        - { tp: 4, conc-start: 4, conc-end: 128 }
+        - { tp: 4, conc-start: 4, conc-end: 32 }
         - { tp: 8, conc-start: 4, conc-end: 8 }
 
 gptoss-fp4-b200-vllm:
diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh
index 44e9dbf4c..35ed2c58a 100644
--- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh
+++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh
@@ -31,7 +31,6 @@ echo "MOE_BACKEND set to '$MOE_BACKEND'"
 
 EXTRA_CONFIG_FILE="gptoss-fp4.yml"
 export TRTLLM_ENABLE_PDL=1
-export NCCL_GRAPH_REGISTER=0
 
 cat > $EXTRA_CONFIG_FILE << EOF
 cuda_graph_config:
@@ -50,6 +49,9 @@ moe_config:
 EOF
 
 if [[ "$DP_ATTENTION" == "true" ]]; then
+  export TRTLLM_MOE_ALLTOALL_BACKEND="mnnvlthroughput"
+  export TRTLLM_FORCE_ALLTOALL_METHOD="MNNVL"
+  export TRTLLM_MOE_A2A_WORKSPACE_MB="2048"
   cat << EOF >> $EXTRA_CONFIG_FILE
 attention_dp_config:
   enable_balance: true