From b1fbf484e2980ad0ddf4ead2289db923eb8cfcef Mon Sep 17 00:00:00 2001 From: Jatin Gangani Date: Mon, 1 Dec 2025 16:37:03 -0800 Subject: [PATCH 1/3] Update GPTOSS B200 AGG --- .github/configs/nvidia-master.yaml | 13 ++++++++----- benchmarks/gptoss_fp4_b200_trt_slurm.sh | 4 +++- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 954abbba2..994209394 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -167,25 +167,27 @@ dsr1-fp8-h200-trt: - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64 } gptoss-fp4-b200-trt: - image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1 + image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc2 model: openai/gpt-oss-120b model-prefix: gptoss runner: b200-trt precision: fp4 framework: trt - # For all sequence lengths, if CONC >= 256, then EP=TP and DP_ATTN=true seq-len-configs: - isl: 1024 osl: 1024 search-space: + - { tp: 2, dp-attn: true, conc-start: 32, conc-end: 128 } + - { tp: 4, dp-attn: true, conc-start: 32, conc-end: 64 } - { tp: 1, conc-start: 64, conc-end: 128 } - - { tp: 2, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 128 } + - { tp: 2, conc-start: 4, conc-end: 32 } + - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 8 } - isl: 1024 osl: 8192 search-space: - { tp: 1, conc-start: 64, conc-end: 128 } + - { tp: 2, dp-attn: true, conc-start: 64, conc-end: 128 } - { tp: 2, conc-start: 4, conc-end: 128 } - { tp: 4, conc-start: 4, conc-end: 128 } - { tp: 8, conc-start: 4, conc-end: 16 } @@ -193,8 +195,9 @@ gptoss-fp4-b200-trt: osl: 1024 search-space: - { tp: 1, conc-start: 64, conc-end: 128 } + - { tp: 2, dp-attn: true, conc-start: 64, conc-end: 128 } - { tp: 2, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 32 } - { tp: 8, conc-start: 4, conc-end: 8 } gptoss-fp4-b200-vllm: diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh index 44e9dbf4c..463e1eca9 100644 --- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh @@ -31,7 +31,9 @@ echo "MOE_BACKEND set to '$MOE_BACKEND'" EXTRA_CONFIG_FILE="gptoss-fp4.yml" export TRTLLM_ENABLE_PDL=1 -export NCCL_GRAPH_REGISTER=0 +export TRTLLM_MOE_ALLTOALL_BACKEND="mnnvlthroughput" +export TRTLLM_FORCE_ALLTOALL_METHOD="MNNVL" +export TRTLLM_MOE_A2A_WORKSPACE_MB="2048" cat > $EXTRA_CONFIG_FILE << EOF cuda_graph_config: From 1abbbb68c01801603315afd6edec9dd4e9e9f868 Mon Sep 17 00:00:00 2001 From: Jatin Gangani Date: Mon, 1 Dec 2025 23:31:59 -0800 Subject: [PATCH 2/3] set dp attention env vars --- benchmarks/gptoss_fp4_b200_trt_slurm.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh index 463e1eca9..35ed2c58a 100644 --- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh @@ -31,9 +31,6 @@ echo "MOE_BACKEND set to '$MOE_BACKEND'" EXTRA_CONFIG_FILE="gptoss-fp4.yml" export TRTLLM_ENABLE_PDL=1 -export TRTLLM_MOE_ALLTOALL_BACKEND="mnnvlthroughput" -export TRTLLM_FORCE_ALLTOALL_METHOD="MNNVL" -export TRTLLM_MOE_A2A_WORKSPACE_MB="2048" cat > $EXTRA_CONFIG_FILE << EOF cuda_graph_config: @@ -52,6 +49,9 @@ moe_config: EOF if [[ "$DP_ATTENTION" == "true" ]]; then + export TRTLLM_MOE_ALLTOALL_BACKEND="mnnvlthroughput" + export TRTLLM_FORCE_ALLTOALL_METHOD="MNNVL" + export TRTLLM_MOE_A2A_WORKSPACE_MB="2048" cat << EOF >> $EXTRA_CONFIG_FILE attention_dp_config: enable_balance: true From eeeb6cdd0cdec5a5c9dd57f78c552d98b24488a1 Mon Sep 17 00:00:00 2001 From: Jatin Gangani Date: Tue, 2 Dec 2025 14:53:36 -0800 Subject: [PATCH 3/3] Add DP attn comment --- .github/configs/nvidia-master.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 994209394..a3d848475 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -173,6 +173,7 @@ gptoss-fp4-b200-trt: runner: b200-trt precision: fp4 framework: trt + # Enable DP_ATTENTION for conc >= 32 seq-len-configs: - isl: 1024 osl: 1024