From a1a0325b206d597742156ac92ee024b434ee71f9 Mon Sep 17 00:00:00 2001 From: Elnifio Date: Tue, 9 Dec 2025 11:00:04 -0800 Subject: [PATCH 01/23] bring all configs here --- .github/configs/nvidia-master.yaml | 108 ++++++++++++++++-- .../dsr1_fp4_gb200_dynamo-sglang_slurm.sh | 37 ++++++ .../dsr1_fp8_gb200_dynamo-sglang_slurm.sh | 8 +- 3 files changed, 141 insertions(+), 12 deletions(-) create mode 100644 benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 1242386de..bef1b90e6 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -798,6 +798,7 @@ dsr1-fp8-gb200-dynamo-sglang: additional-settings: - "PREFILL_NODES=4" - "N_ADDITIONAL_FRONTENDS=9" + - "SCRIPT_MODE=1k1k-max-tpt" decode: num-worker: 1 tp: 1 @@ -819,7 +820,7 @@ dsr1-fp8-gb200-dynamo-sglang: additional-settings: - "PREFILL_NODES=1" - "N_ADDITIONAL_FRONTENDS=9" - - "SCRIPT_MODE=1p_4d" + - "SCRIPT_MODE=1k1k-low-latency" decode: num-worker: 4 tp: 1 @@ -841,6 +842,7 @@ dsr1-fp8-gb200-dynamo-sglang: additional-settings: - "PREFILL_NODES=6" - "N_ADDITIONAL_FRONTENDS=9" + - "SCRIPT_MODE=1k1k-max-tpt" decode: num-worker: 1 tp: 1 @@ -852,22 +854,114 @@ dsr1-fp8-gb200-dynamo-sglang: - isl: 8192 osl: 1024 search-space: + # Low latency (1 prefill worker at DEP4 and 1 decode worker at DEP4) - spec-decoding: "none" - conc-list: [ 128, 256, 384, 448, 512, 576, 1024, 2048, 4096 ] + conc-list: [ 4, 8, 16, 32, 64, 128, 256, 512 ] prefill: - num-worker: 6 - # tp, ep, and dp-attn do nothing because they are hardcoded in the following file: - # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh + num-worker: 1 tp: 1 ep: 1 dp-attn: true additional-settings: - - "PREFILL_NODES=12" + - "PREFILL_NODES=1" - "N_ADDITIONAL_FRONTENDS=8" + - "SCRIPT_MODE=8k1k-low-latency" decode: num-worker: 1 tp: 1 ep: 1 dp-attn: true additional-settings: - - "DECODE_NODES=6" + - "DECODE_NODES=1" + + # Middle and top of curve (5 prefill workers each at DEP8 and 1 decode worker at DEP32) + - spec-decoding: "none" + conc-list: [ 512, 1024, 2048, 6144 ] + prefill: + num-worker: 5 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + - "N_ADDITIONAL_FRONTENDS=8" + - "SCRIPT_MODE=8k1k-max-tpt" + decode: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=8" + +dsr1-fp4-gb200-dynamo-sglang: + # TODO: swap + image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1 + # TODO: what is the right name? + model: deepseek-ai/DeepSeek-R1-0528-fp4-v2 + model-prefix: dsr1 + runner: gb200 + precision: fp4 + framework: dynamo-sglang + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # Low latency (1 prefill worker at DEP4 and 2 decode workers at DEP4) + - spec-decoding: "none" + conc-list: [ 4, 8, 32, 64, 128, 112, 128, 256 ] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + - "N_ADDITIONAL_FRONTENDS=8" + decode: + num-worker: 2 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=2" + + # Mid curve (1 prefill worker at DEP4 and 1 decode workers at DEP48) + - spec-decoding: "none" + conc-list: [ 512, 1024, 2048, 4096, 8192 ] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + - "N_ADDITIONAL_FRONTENDS=8" + decode: + num-worker: 2 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=12" + + # Top of curve (1 prefill worker at DEP4 and 1 decode worker at DEP32) + - spec-decoding: "none" + conc-list: [ 8192, 12000, 15000 ] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + - "N_ADDITIONAL_FRONTENDS=8" + decode: + num-worker: 2 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=8" \ No newline at end of file diff --git a/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh b/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh new file mode 100644 index 000000000..7a668f30c --- /dev/null +++ b/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh @@ -0,0 +1,37 @@ + +#!/bin/bash + +set -x + +source "$(dirname "$0")/benchmark_lib.sh" + +check_env_vars CONC_LIST ISL OSL IMAGE SPEC_DECODING MODEL_PATH \ + PREFILL_NUM_WORKERS PREFILL_TP PREFILL_EP PREFILL_DP_ATTN \ + DECODE_NUM_WORKERS DECODE_TP DECODE_EP DECODE_DP_ATTN \ + PREFILL_NODES DECODE_NODES N_ADDITIONAL_FRONTENDS SGL_SLURM_JOBS_PATH # SGL_SLURM_JOBS_PATH FIXME + +# Always clone and setup Dynamo +echo "Cloning Dynamo repository..." +git clone --branch ishan/sa-1.1-sgl-dsr1 https://github.com/ai-dynamo/dynamo.git + +cd "$SGL_SLURM_JOBS_PATH" + +# Set up SGL launch script-specific environment variables +export TIME_LIMIT="04:00:00" +export MODEL_PATH=$MODEL_PATH +export CONFIG_DIR=$CONFIG_DIR +export CONTAINER_IMAGE=$IMAGE +export GPU_TYPE="gb200-fp4" + +# Launch jobs based on ISL/OSL +# Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented +# by a list of numbers delimted by 'x'. This is because of how the underlying launch script +# expects the concurrencies. +bash ./submit_disagg.sh $PREFILL_NODES \ + $PREFILL_NUM_WORKERS \ + $DECODE_NODES \ + $DECODE_NUM_WORKERS \ + $N_ADDITIONAL_FRONTENDS \ + $ISL $OSL "${CONC_LIST// /x}" inf \ + $GPU_TYPE \ + $SCRIPT_MODE \ No newline at end of file diff --git a/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh b/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh index 99e2c7afd..4e44b0414 100644 --- a/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh +++ b/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh @@ -12,11 +12,7 @@ check_env_vars CONC_LIST ISL OSL IMAGE SPEC_DECODING MODEL_PATH \ # Always clone and setup Dynamo echo "Cloning Dynamo repository..." -if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then - git clone --branch ishan/sa-1.1-sgl-dsr1-fp8 https://github.com/ai-dynamo/dynamo.git -else - git clone --branch update-result-file-name https://github.com/Elnifio/dynamo.git -fi +git clone --branch ishan/sa-1.1-sgl-dsr1 https://github.com/ai-dynamo/dynamo.git cd "$SGL_SLURM_JOBS_PATH" @@ -25,6 +21,7 @@ export TIME_LIMIT="04:00:00" export MODEL_PATH=$MODEL_PATH export CONFIG_DIR=$CONFIG_DIR export CONTAINER_IMAGE=$IMAGE +export GPU_TYPE="gb200-fp8" # Launch jobs based on ISL/OSL # Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented @@ -36,4 +33,5 @@ bash ./submit_disagg.sh $PREFILL_NODES \ $DECODE_NUM_WORKERS \ $N_ADDITIONAL_FRONTENDS \ $ISL $OSL "${CONC_LIST// /x}" inf \ + $GPU_TYPE \ $SCRIPT_MODE \ No newline at end of file From c03076b0d4782f8a011f0a44c15ccac5dfa93761 Mon Sep 17 00:00:00 2001 From: Elnifio Date: Tue, 9 Dec 2025 11:19:36 -0800 Subject: [PATCH 02/23] test for GB200 only --- .github/workflows/full-sweep-1k1k-scheduler.yml | 5 +++-- .github/workflows/full-sweep-8k1k-scheduler.yml | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/full-sweep-1k1k-scheduler.yml b/.github/workflows/full-sweep-1k1k-scheduler.yml index 8b32f47c0..fcd23e96f 100644 --- a/.github/workflows/full-sweep-1k1k-scheduler.yml +++ b/.github/workflows/full-sweep-1k1k-scheduler.yml @@ -18,13 +18,14 @@ jobs: - id: get-dsr1-configs run: | pip install pydantic - CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k1k --model-prefix dsr1 --framework dynamo-sglang --runner-type gb200 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k1k --model-prefix dsr1 --framework dynamo-sglang --runner-type gb200 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT get-gptoss-configs: runs-on: ubuntu-latest + if: ${{ false }} outputs: multi-node-search-space-config: ${{ steps.get-gptoss-configs.outputs.multi-node-search-space-config }} single-node-search-space-config: ${{ steps.get-gptoss-configs.outputs.single-node-search-space-config }} diff --git a/.github/workflows/full-sweep-8k1k-scheduler.yml b/.github/workflows/full-sweep-8k1k-scheduler.yml index 629e56bd9..6ef3d0e5a 100644 --- a/.github/workflows/full-sweep-8k1k-scheduler.yml +++ b/.github/workflows/full-sweep-8k1k-scheduler.yml @@ -18,13 +18,14 @@ jobs: - id: get-dsr1-configs run: | pip install pydantic - CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 8k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 8k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 8k1k --model-prefix dsr1 --framework dynamo-sglang --runner-type gb200 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 8k1k --model-prefix dsr1 --framework dynamo-sglang --runner-type gb200 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT get-gptoss-configs: runs-on: ubuntu-latest + if: ${{ false }} outputs: multi-node-search-space-config: ${{ steps.get-gptoss-configs.outputs.multi-node-search-space-config }} single-node-search-space-config: ${{ steps.get-gptoss-configs.outputs.single-node-search-space-config }} From 028f22431ed7a40b07e8d18b478235787d855579 Mon Sep 17 00:00:00 2001 From: Elnifio Date: Tue, 9 Dec 2025 11:27:04 -0800 Subject: [PATCH 03/23] updates the files and git clone urls --- benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh | 5 +++-- benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh | 5 +++-- runners/launch_gb200-nv.sh | 11 ++--------- 3 files changed, 8 insertions(+), 13 deletions(-) diff --git a/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh b/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh index 7a668f30c..ae9059e38 100644 --- a/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh +++ b/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh @@ -12,7 +12,8 @@ check_env_vars CONC_LIST ISL OSL IMAGE SPEC_DECODING MODEL_PATH \ # Always clone and setup Dynamo echo "Cloning Dynamo repository..." -git clone --branch ishan/sa-1.1-sgl-dsr1 https://github.com/ai-dynamo/dynamo.git +git clone https://github.com/ai-dynamo/dynamo.git +cd dynamo && git checkout 80dfb82c5934aa1605105baed49403e74b83a779 && cd .. cd "$SGL_SLURM_JOBS_PATH" @@ -34,4 +35,4 @@ bash ./submit_disagg.sh $PREFILL_NODES \ $N_ADDITIONAL_FRONTENDS \ $ISL $OSL "${CONC_LIST// /x}" inf \ $GPU_TYPE \ - $SCRIPT_MODE \ No newline at end of file + $SCRIPT_MODE diff --git a/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh b/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh index 4e44b0414..98ab22243 100644 --- a/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh +++ b/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh @@ -12,7 +12,8 @@ check_env_vars CONC_LIST ISL OSL IMAGE SPEC_DECODING MODEL_PATH \ # Always clone and setup Dynamo echo "Cloning Dynamo repository..." -git clone --branch ishan/sa-1.1-sgl-dsr1 https://github.com/ai-dynamo/dynamo.git +git clone https://github.com/ai-dynamo/dynamo.git +cd dynamo && git checkout 80dfb82c5934aa1605105baed49403e74b83a779 && cd .. cd "$SGL_SLURM_JOBS_PATH" @@ -34,4 +35,4 @@ bash ./submit_disagg.sh $PREFILL_NODES \ $N_ADDITIONAL_FRONTENDS \ $ISL $OSL "${CONC_LIST// /x}" inf \ $GPU_TYPE \ - $SCRIPT_MODE \ No newline at end of file + $SCRIPT_MODE diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index d9164469e..d4a61e9eb 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -22,14 +22,7 @@ if [[ $FRAMEWORK == "dynamo-sglang" ]]; then fi export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528" export CONFIG_DIR="/mnt/lustre01/artifacts/sglang-configs/1k1k" - - # FIXME: Another workaround for all the different branching - # THIS NEEDS TO BE STANDARDIZED ASAP - if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then - export SGL_SLURM_JOBS_PATH="dynamo/examples/backends/sglang/slurm_jobs" - else - export SGL_SLURM_JOBS_PATH="dynamo/components/backends/sglang/slurm_jobs" - fi + export SGL_SLURM_JOBS_PATH="dynamo/examples/backends/sglang/slurm_jobs" else SQUASH_FILE="/mnt/lustre01/users/sa-shared/images/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" srun --partition=$SLURM_PARTITION --exclusive --time=180 bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" @@ -148,4 +141,4 @@ PY done fi -echo "All result files processed" \ No newline at end of file +echo "All result files processed" From 25a19b1aba97ef15d1f2ff14bd14670caeb8ab70 Mon Sep 17 00:00:00 2001 From: Elnifio Date: Wed, 10 Dec 2025 10:33:27 -0800 Subject: [PATCH 04/23] update the prefill nodes --- .github/configs/nvidia-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index bef1b90e6..79ffbc806 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -883,7 +883,7 @@ dsr1-fp8-gb200-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "PREFILL_NODES=2" + - "PREFILL_NODES=10" - "N_ADDITIONAL_FRONTENDS=8" - "SCRIPT_MODE=8k1k-max-tpt" decode: @@ -964,4 +964,4 @@ dsr1-fp4-gb200-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "DECODE_NODES=8" \ No newline at end of file + - "DECODE_NODES=8" From 124ddf45974235e4f888d32fcb68672aec0af086 Mon Sep 17 00:00:00 2001 From: Elnifio Date: Wed, 10 Dec 2025 10:40:15 -0800 Subject: [PATCH 05/23] update 1k1k fp4 config --- .github/configs/nvidia-master.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 79ffbc806..411b4d71d 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -920,6 +920,7 @@ dsr1-fp4-gb200-dynamo-sglang: additional-settings: - "PREFILL_NODES=1" - "N_ADDITIONAL_FRONTENDS=8" + - "SCRIPT_MODE=1k1k-low-latency" decode: num-worker: 2 tp: 1 @@ -939,6 +940,7 @@ dsr1-fp4-gb200-dynamo-sglang: additional-settings: - "PREFILL_NODES=1" - "N_ADDITIONAL_FRONTENDS=8" + - "SCRIPT_MODE=1k1k-middle-curve" decode: num-worker: 2 tp: 1 @@ -958,6 +960,7 @@ dsr1-fp4-gb200-dynamo-sglang: additional-settings: - "PREFILL_NODES=1" - "N_ADDITIONAL_FRONTENDS=8" + - "SCRIPT_MODE=1k1k-max-tpt" decode: num-worker: 2 tp: 1 From 61990310c4ead157e03315dc5a3d186654e43d29 Mon Sep 17 00:00:00 2001 From: Elnifio Date: Wed, 10 Dec 2025 10:42:18 -0800 Subject: [PATCH 06/23] updates to run 1k1k fp4 only --- .github/workflows/full-sweep-1k1k-scheduler.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/full-sweep-1k1k-scheduler.yml b/.github/workflows/full-sweep-1k1k-scheduler.yml index fcd23e96f..372fd9622 100644 --- a/.github/workflows/full-sweep-1k1k-scheduler.yml +++ b/.github/workflows/full-sweep-1k1k-scheduler.yml @@ -18,8 +18,8 @@ jobs: - id: get-dsr1-configs run: | pip install pydantic - CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k1k --model-prefix dsr1 --framework dynamo-sglang --runner-type gb200 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k1k --model-prefix dsr1 --framework dynamo-sglang --runner-type gb200 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k1k --model-prefix dsr1 --framework dynamo-sglang --runner-type gb200 --precision fp4 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k1k --model-prefix dsr1 --framework dynamo-sglang --runner-type gb200 --precision fp4 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT From 344ac6c2982f9940725127d97f45a01b26e8fccc Mon Sep 17 00:00:00 2001 From: Elnifio Date: Wed, 10 Dec 2025 15:02:20 -0800 Subject: [PATCH 07/23] updates the FP4 8k1k --- .github/configs/nvidia-master.yaml | 57 +++++++++++++++++++ .../dsr1_fp4_gb200_dynamo-sglang_slurm.sh | 7 ++- runners/launch_gb200-nv.sh | 7 +-- 3 files changed, 64 insertions(+), 7 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 411b4d71d..2ff34c42d 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -968,3 +968,60 @@ dsr1-fp4-gb200-dynamo-sglang: dp-attn: true additional-settings: - "DECODE_NODES=8" + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 4, 8, 32, 64 ] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "N_ADDITIONAL_FRONTENDS=8" + - "SCRIPT_MODE=8k1k-low-latency" + decode: + num-worker: 4 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=4" + - spec-decoding: "none" + conc-list: [ 512, 1024, 2048, 4096 ] + prefill: + num-worker: 6 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=6" + - "N_ADDITIONAL_FRONTENDS=9" + - "SCRIPT_MODE=8k1k-middle-curve" + decode: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=12" + - spec-decoding: "none" + conc-list: [ 1024, 2048, 8192, 10240 ] + prefill: + num-worker: 10 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=10" + - "N_ADDITIONAL_FRONTENDS=8" + - "SCRIPT_MODE=8k1k-max-tpt" + decode: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=8" diff --git a/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh b/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh index ae9059e38..40d1e8345 100644 --- a/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh +++ b/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh @@ -13,7 +13,12 @@ check_env_vars CONC_LIST ISL OSL IMAGE SPEC_DECODING MODEL_PATH \ # Always clone and setup Dynamo echo "Cloning Dynamo repository..." git clone https://github.com/ai-dynamo/dynamo.git -cd dynamo && git checkout 80dfb82c5934aa1605105baed49403e74b83a779 && cd .. +if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then + cd dynamo && git checkout 80dfb82c5934aa1605105baed49403e74b83a779 && cd .. +else + cd dynamo && git checkout ishan/fp48k1k && cd .. # temporarily check out this branch until this is merged +fi + cd "$SGL_SLURM_JOBS_PATH" diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index d4a61e9eb..3867aa00a 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -14,12 +14,7 @@ export SLURM_JOB_NAME="benchmark-dynamo.job" ### FRAMEWORK_DIFF_IF_STATEMENT #1 - difference in setting up envvars if [[ $FRAMEWORK == "dynamo-sglang" ]]; then - # Set IMAGE based on ISL/OSL - if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then - export IMAGE="/mnt/lustre01/artifacts/containers/lmsysorg+sglang+v0.5.5.post2.sqsh" - else - export IMAGE="/mnt/lustre01/artifacts/containers/dynamo-sglang.sqsh" - fi + export IMAGE="/mnt/lustre01/artifacts/containers/lmsysorg+sglang+v0.5.5.post2.sqsh" export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528" export CONFIG_DIR="/mnt/lustre01/artifacts/sglang-configs/1k1k" export SGL_SLURM_JOBS_PATH="dynamo/examples/backends/sglang/slurm_jobs" From 355773a13e71f34e3dc5a7076b80abb1fde00674 Mon Sep 17 00:00:00 2001 From: Elnifio Date: Wed, 10 Dec 2025 15:42:33 -0800 Subject: [PATCH 08/23] update the model path --- runners/launch_gb200-nv.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 3867aa00a..fdde90cf3 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -15,7 +15,13 @@ export SLURM_JOB_NAME="benchmark-dynamo.job" ### FRAMEWORK_DIFF_IF_STATEMENT #1 - difference in setting up envvars if [[ $FRAMEWORK == "dynamo-sglang" ]]; then export IMAGE="/mnt/lustre01/artifacts/containers/lmsysorg+sglang+v0.5.5.post2.sqsh" - export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528" + + if [[ $PRECISION == "fp4" ]]; then + export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528-fp4-v2" + else + export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528" + fi + export CONFIG_DIR="/mnt/lustre01/artifacts/sglang-configs/1k1k" export SGL_SLURM_JOBS_PATH="dynamo/examples/backends/sglang/slurm_jobs" else From 0dd1e5a6b12c9f7a69bedc4404c8512c0caf0051 Mon Sep 17 00:00:00 2001 From: Elnifio Date: Wed, 10 Dec 2025 15:44:17 -0800 Subject: [PATCH 09/23] restore changes to full sweeps --- .github/workflows/full-sweep-1k1k-scheduler.yml | 5 ++--- .github/workflows/full-sweep-8k1k-scheduler.yml | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/workflows/full-sweep-1k1k-scheduler.yml b/.github/workflows/full-sweep-1k1k-scheduler.yml index 372fd9622..8b32f47c0 100644 --- a/.github/workflows/full-sweep-1k1k-scheduler.yml +++ b/.github/workflows/full-sweep-1k1k-scheduler.yml @@ -18,14 +18,13 @@ jobs: - id: get-dsr1-configs run: | pip install pydantic - CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k1k --model-prefix dsr1 --framework dynamo-sglang --runner-type gb200 --precision fp4 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k1k --model-prefix dsr1 --framework dynamo-sglang --runner-type gb200 --precision fp4 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT get-gptoss-configs: runs-on: ubuntu-latest - if: ${{ false }} outputs: multi-node-search-space-config: ${{ steps.get-gptoss-configs.outputs.multi-node-search-space-config }} single-node-search-space-config: ${{ steps.get-gptoss-configs.outputs.single-node-search-space-config }} diff --git a/.github/workflows/full-sweep-8k1k-scheduler.yml b/.github/workflows/full-sweep-8k1k-scheduler.yml index 6ef3d0e5a..629e56bd9 100644 --- a/.github/workflows/full-sweep-8k1k-scheduler.yml +++ b/.github/workflows/full-sweep-8k1k-scheduler.yml @@ -18,14 +18,13 @@ jobs: - id: get-dsr1-configs run: | pip install pydantic - CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 8k1k --model-prefix dsr1 --framework dynamo-sglang --runner-type gb200 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 8k1k --model-prefix dsr1 --framework dynamo-sglang --runner-type gb200 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 8k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 8k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT get-gptoss-configs: runs-on: ubuntu-latest - if: ${{ false }} outputs: multi-node-search-space-config: ${{ steps.get-gptoss-configs.outputs.multi-node-search-space-config }} single-node-search-space-config: ${{ steps.get-gptoss-configs.outputs.single-node-search-space-config }} From 7da0be5313e153522c7b37c9d4a5ca3a3a7ee0ad Mon Sep 17 00:00:00 2001 From: Elnifio Date: Thu, 11 Dec 2025 09:50:57 -0800 Subject: [PATCH 10/23] updates the config for 1k1k fp4 --- .github/configs/nvidia-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 2ff34c42d..082ac88c4 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -942,7 +942,7 @@ dsr1-fp4-gb200-dynamo-sglang: - "N_ADDITIONAL_FRONTENDS=8" - "SCRIPT_MODE=1k1k-middle-curve" decode: - num-worker: 2 + num-worker: 1 tp: 1 ep: 1 dp-attn: true @@ -962,7 +962,7 @@ dsr1-fp4-gb200-dynamo-sglang: - "N_ADDITIONAL_FRONTENDS=8" - "SCRIPT_MODE=1k1k-max-tpt" decode: - num-worker: 2 + num-worker: 1 tp: 1 ep: 1 dp-attn: true From b38b633e09925f74e7b2b15f695e1f2bd4042332 Mon Sep 17 00:00:00 2001 From: Elnifio Date: Thu, 11 Dec 2025 12:09:52 -0800 Subject: [PATCH 11/23] temporarily disable some concurrencies --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 082ac88c4..7c4585d9c 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1008,7 +1008,7 @@ dsr1-fp4-gb200-dynamo-sglang: additional-settings: - "DECODE_NODES=12" - spec-decoding: "none" - conc-list: [ 1024, 2048, 8192, 10240 ] + conc-list: [ 1024, 2048, 8192 ] prefill: num-worker: 10 tp: 1 From 8136816bbf8a403fff1f4d43d933d458f299543e Mon Sep 17 00:00:00 2001 From: Elnifio Date: Thu, 11 Dec 2025 17:26:59 -0800 Subject: [PATCH 12/23] updates the params --- .github/configs/nvidia-master.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 7c4585d9c..436d8c57f 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -933,12 +933,12 @@ dsr1-fp4-gb200-dynamo-sglang: - spec-decoding: "none" conc-list: [ 512, 1024, 2048, 4096, 8192 ] prefill: - num-worker: 1 + num-worker: 4 tp: 1 ep: 1 dp-attn: true additional-settings: - - "PREFILL_NODES=1" + - "PREFILL_NODES=4" - "N_ADDITIONAL_FRONTENDS=8" - "SCRIPT_MODE=1k1k-middle-curve" decode: @@ -953,12 +953,12 @@ dsr1-fp4-gb200-dynamo-sglang: - spec-decoding: "none" conc-list: [ 8192, 12000, 15000 ] prefill: - num-worker: 1 + num-worker: 4 tp: 1 ep: 1 dp-attn: true additional-settings: - - "PREFILL_NODES=1" + - "PREFILL_NODES=4" - "N_ADDITIONAL_FRONTENDS=8" - "SCRIPT_MODE=1k1k-max-tpt" decode: From c1f1be4bab6c7deef33d245adeedba1489fbec76 Mon Sep 17 00:00:00 2001 From: Elnifio Date: Fri, 12 Dec 2025 14:19:41 -0800 Subject: [PATCH 13/23] updates the branch --- benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh | 7 +------ benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh | 2 +- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh b/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh index 40d1e8345..70b9c699e 100644 --- a/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh +++ b/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh @@ -13,12 +13,7 @@ check_env_vars CONC_LIST ISL OSL IMAGE SPEC_DECODING MODEL_PATH \ # Always clone and setup Dynamo echo "Cloning Dynamo repository..." git clone https://github.com/ai-dynamo/dynamo.git -if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then - cd dynamo && git checkout 80dfb82c5934aa1605105baed49403e74b83a779 && cd .. -else - cd dynamo && git checkout ishan/fp48k1k && cd .. # temporarily check out this branch until this is merged -fi - +cd dynamo && git checkout ishan/fp48k1k && cd .. # temporarily check out this branch until this is merged cd "$SGL_SLURM_JOBS_PATH" diff --git a/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh b/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh index 98ab22243..cabd17b7f 100644 --- a/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh +++ b/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh @@ -13,7 +13,7 @@ check_env_vars CONC_LIST ISL OSL IMAGE SPEC_DECODING MODEL_PATH \ # Always clone and setup Dynamo echo "Cloning Dynamo repository..." git clone https://github.com/ai-dynamo/dynamo.git -cd dynamo && git checkout 80dfb82c5934aa1605105baed49403e74b83a779 && cd .. +cd dynamo && git checkout ishan/fp48k1k && cd .. # temporarily check out this branch until this is merged cd "$SGL_SLURM_JOBS_PATH" From 7a8e890b5b22305d037b87b82f388cba0b866661 Mon Sep 17 00:00:00 2001 From: Elnifio Date: Mon, 15 Dec 2025 11:15:58 -0800 Subject: [PATCH 14/23] update config --- .github/configs/nvidia-master.yaml | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 436d8c57f..ce714c250 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -911,7 +911,7 @@ dsr1-fp4-gb200-dynamo-sglang: search-space: # Low latency (1 prefill worker at DEP4 and 2 decode workers at DEP4) - spec-decoding: "none" - conc-list: [ 4, 8, 32, 64, 128, 112, 128, 256 ] + conc-list: [ 4, 8, 32, 64 ] prefill: num-worker: 1 tp: 1 @@ -1008,7 +1008,25 @@ dsr1-fp4-gb200-dynamo-sglang: additional-settings: - "DECODE_NODES=12" - spec-decoding: "none" - conc-list: [ 1024, 2048, 8192 ] + conc-list: [ 1024, 2048, ] + prefill: + num-worker: 10 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=10" + - "N_ADDITIONAL_FRONTENDS=8" + - "SCRIPT_MODE=8k1k-max-tpt" + decode: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=8" + - spec-decoding: "none" + conc-list: [ 8192 ] prefill: num-worker: 10 tp: 1 From ce40018c179b28f70710a23112b6e8d3773422a2 Mon Sep 17 00:00:00 2001 From: Elnifio Date: Mon, 15 Dec 2025 11:21:13 -0800 Subject: [PATCH 15/23] temporarily disable all other configs --- .github/configs/nvidia-master.yaml | 108 ++++++++++++++--------------- 1 file changed, 54 insertions(+), 54 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index ce714c250..9dcb53977 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -971,60 +971,60 @@ dsr1-fp4-gb200-dynamo-sglang: - isl: 8192 osl: 1024 search-space: - - spec-decoding: "none" - conc-list: [ 4, 8, 32, 64 ] - prefill: - num-worker: 1 - tp: 1 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - "N_ADDITIONAL_FRONTENDS=8" - - "SCRIPT_MODE=8k1k-low-latency" - decode: - num-worker: 4 - tp: 1 - ep: 1 - dp-attn: true - additional-settings: - - "DECODE_NODES=4" - - spec-decoding: "none" - conc-list: [ 512, 1024, 2048, 4096 ] - prefill: - num-worker: 6 - tp: 1 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=6" - - "N_ADDITIONAL_FRONTENDS=9" - - "SCRIPT_MODE=8k1k-middle-curve" - decode: - num-worker: 1 - tp: 1 - ep: 1 - dp-attn: true - additional-settings: - - "DECODE_NODES=12" - - spec-decoding: "none" - conc-list: [ 1024, 2048, ] - prefill: - num-worker: 10 - tp: 1 - ep: 1 - dp-attn: true - additional-settings: - - "PREFILL_NODES=10" - - "N_ADDITIONAL_FRONTENDS=8" - - "SCRIPT_MODE=8k1k-max-tpt" - decode: - num-worker: 1 - tp: 1 - ep: 1 - dp-attn: true - additional-settings: - - "DECODE_NODES=8" + # - spec-decoding: "none" + # conc-list: [ 4, 8, 32, 64 ] + # prefill: + # num-worker: 1 + # tp: 1 + # ep: 1 + # dp-attn: false + # additional-settings: + # - "PREFILL_NODES=1" + # - "N_ADDITIONAL_FRONTENDS=8" + # - "SCRIPT_MODE=8k1k-low-latency" + # decode: + # num-worker: 4 + # tp: 1 + # ep: 1 + # dp-attn: true + # additional-settings: + # - "DECODE_NODES=4" + # - spec-decoding: "none" + # conc-list: [ 512, 1024, 2048, 4096 ] + # prefill: + # num-worker: 6 + # tp: 1 + # ep: 1 + # dp-attn: false + # additional-settings: + # - "PREFILL_NODES=6" + # - "N_ADDITIONAL_FRONTENDS=9" + # - "SCRIPT_MODE=8k1k-middle-curve" + # decode: + # num-worker: 1 + # tp: 1 + # ep: 1 + # dp-attn: true + # additional-settings: + # - "DECODE_NODES=12" + # - spec-decoding: "none" + # conc-list: [ 1024, 2048 ] + # prefill: + # num-worker: 10 + # tp: 1 + # ep: 1 + # dp-attn: true + # additional-settings: + # - "PREFILL_NODES=10" + # - "N_ADDITIONAL_FRONTENDS=8" + # - "SCRIPT_MODE=8k1k-max-tpt" + # decode: + # num-worker: 1 + # tp: 1 + # ep: 1 + # dp-attn: true + # additional-settings: + # - "DECODE_NODES=8" - spec-decoding: "none" conc-list: [ 8192 ] prefill: From 35c7eb3425316beaa2346fce2a088859ed37ebb4 Mon Sep 17 00:00:00 2001 From: Elnifio Date: Mon, 15 Dec 2025 16:05:32 -0800 Subject: [PATCH 16/23] Revert "temporarily disable all other configs" This reverts commit ce40018c179b28f70710a23112b6e8d3773422a2. --- .github/configs/nvidia-master.yaml | 108 ++++++++++++++--------------- 1 file changed, 54 insertions(+), 54 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 9dcb53977..ce714c250 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -971,60 +971,60 @@ dsr1-fp4-gb200-dynamo-sglang: - isl: 8192 osl: 1024 search-space: - # - spec-decoding: "none" - # conc-list: [ 4, 8, 32, 64 ] - # prefill: - # num-worker: 1 - # tp: 1 - # ep: 1 - # dp-attn: false - # additional-settings: - # - "PREFILL_NODES=1" - # - "N_ADDITIONAL_FRONTENDS=8" - # - "SCRIPT_MODE=8k1k-low-latency" - # decode: - # num-worker: 4 - # tp: 1 - # ep: 1 - # dp-attn: true - # additional-settings: - # - "DECODE_NODES=4" - # - spec-decoding: "none" - # conc-list: [ 512, 1024, 2048, 4096 ] - # prefill: - # num-worker: 6 - # tp: 1 - # ep: 1 - # dp-attn: false - # additional-settings: - # - "PREFILL_NODES=6" - # - "N_ADDITIONAL_FRONTENDS=9" - # - "SCRIPT_MODE=8k1k-middle-curve" - # decode: - # num-worker: 1 - # tp: 1 - # ep: 1 - # dp-attn: true - # additional-settings: - # - "DECODE_NODES=12" - # - spec-decoding: "none" - # conc-list: [ 1024, 2048 ] - # prefill: - # num-worker: 10 - # tp: 1 - # ep: 1 - # dp-attn: true - # additional-settings: - # - "PREFILL_NODES=10" - # - "N_ADDITIONAL_FRONTENDS=8" - # - "SCRIPT_MODE=8k1k-max-tpt" - # decode: - # num-worker: 1 - # tp: 1 - # ep: 1 - # dp-attn: true - # additional-settings: - # - "DECODE_NODES=8" + - spec-decoding: "none" + conc-list: [ 4, 8, 32, 64 ] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "N_ADDITIONAL_FRONTENDS=8" + - "SCRIPT_MODE=8k1k-low-latency" + decode: + num-worker: 4 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=4" + - spec-decoding: "none" + conc-list: [ 512, 1024, 2048, 4096 ] + prefill: + num-worker: 6 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=6" + - "N_ADDITIONAL_FRONTENDS=9" + - "SCRIPT_MODE=8k1k-middle-curve" + decode: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=12" + - spec-decoding: "none" + conc-list: [ 1024, 2048, ] + prefill: + num-worker: 10 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=10" + - "N_ADDITIONAL_FRONTENDS=8" + - "SCRIPT_MODE=8k1k-max-tpt" + decode: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=8" - spec-decoding: "none" conc-list: [ 8192 ] prefill: From b26d699e83edd3fc35dca1d80051ecb55345cb2b Mon Sep 17 00:00:00 2001 From: Elnifio Date: Mon, 15 Dec 2025 16:08:20 -0800 Subject: [PATCH 17/23] update comments --- benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh | 2 +- benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh b/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh index 70b9c699e..6b7d1c1b0 100644 --- a/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh +++ b/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh @@ -13,7 +13,7 @@ check_env_vars CONC_LIST ISL OSL IMAGE SPEC_DECODING MODEL_PATH \ # Always clone and setup Dynamo echo "Cloning Dynamo repository..." git clone https://github.com/ai-dynamo/dynamo.git -cd dynamo && git checkout ishan/fp48k1k && cd .. # temporarily check out this branch until this is merged +cd dynamo && git checkout ishan/fp48k1k && cd .. # All configs are now tracked in this branch cd "$SGL_SLURM_JOBS_PATH" diff --git a/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh b/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh index cabd17b7f..86debfb73 100644 --- a/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh +++ b/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh @@ -13,7 +13,7 @@ check_env_vars CONC_LIST ISL OSL IMAGE SPEC_DECODING MODEL_PATH \ # Always clone and setup Dynamo echo "Cloning Dynamo repository..." git clone https://github.com/ai-dynamo/dynamo.git -cd dynamo && git checkout ishan/fp48k1k && cd .. # temporarily check out this branch until this is merged +cd dynamo && git checkout ishan/fp48k1k && cd .. # All configs are now tracked in this branch cd "$SGL_SLURM_JOBS_PATH" From c1024db628922ba701e8ece021adda7e6df7f35a Mon Sep 17 00:00:00 2001 From: Elnifio Date: Tue, 16 Dec 2025 23:06:18 -0800 Subject: [PATCH 18/23] bump the image for DSR1 --- .github/configs/nvidia-master.yaml | 5 ++--- runners/launch_gb200-nv.sh | 12 ++++++------ 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index ce714c250..45a404c41 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -773,7 +773,7 @@ dsr1-fp4-gb200-dynamo-trt: - "DECODE_MTP_SIZE=0" dsr1-fp8-gb200-dynamo-sglang: - image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1 + image: lmsysorg/sglang:v0.5.5.post2 model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 runner: gb200 @@ -895,8 +895,7 @@ dsr1-fp8-gb200-dynamo-sglang: - "DECODE_NODES=8" dsr1-fp4-gb200-dynamo-sglang: - # TODO: swap - image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1 + image: lmsysorg/sglang:v0.5.5.post2 # TODO: what is the right name? model: deepseek-ai/DeepSeek-R1-0528-fp4-v2 model-prefix: dsr1 diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index fdde90cf3..7b2dc1931 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -13,8 +13,13 @@ export SLURM_JOB_NAME="benchmark-dynamo.job" # For now we add conditionals to this script to use newer code for the 1k1k configs ### FRAMEWORK_DIFF_IF_STATEMENT #1 - difference in setting up envvars +SQUASH_FILE="/mnt/lustre01/users/sa-shared/images/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +srun --partition=$SLURM_PARTITION --exclusive --time=180 bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" + +# Update the IMAGE variable to the squash file +export IMAGE=$SQUASH_FILE + if [[ $FRAMEWORK == "dynamo-sglang" ]]; then - export IMAGE="/mnt/lustre01/artifacts/containers/lmsysorg+sglang+v0.5.5.post2.sqsh" if [[ $PRECISION == "fp4" ]]; then export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528-fp4-v2" @@ -25,11 +30,6 @@ if [[ $FRAMEWORK == "dynamo-sglang" ]]; then export CONFIG_DIR="/mnt/lustre01/artifacts/sglang-configs/1k1k" export SGL_SLURM_JOBS_PATH="dynamo/examples/backends/sglang/slurm_jobs" else - SQUASH_FILE="/mnt/lustre01/users/sa-shared/images/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" - srun --partition=$SLURM_PARTITION --exclusive --time=180 bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" - - # Update the IMAGE variable to the squash file - export IMAGE=$SQUASH_FILE export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528-fp4-v2" export SERVED_MODEL_NAME="deepseek-r1-fp4" From 35d75551d138150185e06dd308c9d33829a4e9de Mon Sep 17 00:00:00 2001 From: Elnifio Date: Wed, 17 Dec 2025 14:47:19 -0800 Subject: [PATCH 19/23] update the model-path args --- .github/configs/nvidia-master.yaml | 6 ++++++ .github/workflows/benchmark-multinode-tmpl.yml | 1 + runners/launch_gb200-nv.sh | 13 +++---------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 45a404c41..a2e9cacc8 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -333,6 +333,8 @@ gptoss-fp4-h200-vllm: dsr1-fp4-gb200-dynamo-trt: image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3 model: deepseek-r1-fp4 + # Models are pre-downloaded to this path on GB200 runner to avoid repeated downloading + model-path: /mnt/lustre01/models/deepseek-r1-0528 model-prefix: dsr1 runner: gb200 precision: fp4 @@ -775,6 +777,8 @@ dsr1-fp4-gb200-dynamo-trt: dsr1-fp8-gb200-dynamo-sglang: image: lmsysorg/sglang:v0.5.5.post2 model: deepseek-ai/DeepSeek-R1-0528 + # Models are pre-downloaded to this path on GB200 runner to avoid repeated downloading + model-path: /mnt/lustre01/models/deepseek-r1-0528 model-prefix: dsr1 runner: gb200 precision: fp8 @@ -898,6 +902,8 @@ dsr1-fp4-gb200-dynamo-sglang: image: lmsysorg/sglang:v0.5.5.post2 # TODO: what is the right name? model: deepseek-ai/DeepSeek-R1-0528-fp4-v2 + # Models are pre-downloaded to this path on GB200 runner to avoid repeated downloading + model-path: /mnt/lustre01/models/deepseek-r1-0528-fp4-v2 model-prefix: dsr1 runner: gb200 precision: fp4 diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index 93de8faa0..0668ce749 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -86,6 +86,7 @@ env: EXP_NAME: ${{ inputs.exp-name }} IMAGE: ${{ inputs.image }} MODEL_PREFIX: ${{ inputs.model-prefix }} + MODEL_PATH: ${{ inputs.model-path }} FRAMEWORK: ${{ inputs.framework }} PRECISION: ${{ inputs.precision }} ISL: ${{ inputs.isl }} diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 7b2dc1931..fb02290c3 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -19,23 +19,16 @@ srun --partition=$SLURM_PARTITION --exclusive --time=180 bash -c "enroot import # Update the IMAGE variable to the squash file export IMAGE=$SQUASH_FILE -if [[ $FRAMEWORK == "dynamo-sglang" ]]; then - - if [[ $PRECISION == "fp4" ]]; then - export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528-fp4-v2" - else - export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528" - fi +# MODEL_PATH is set in `nvidia-master.yaml` or any other yaml files +export MODEL_PATH=$MODEL_PATH +if [[ $FRAMEWORK == "dynamo-sglang" ]]; then export CONFIG_DIR="/mnt/lustre01/artifacts/sglang-configs/1k1k" export SGL_SLURM_JOBS_PATH="dynamo/examples/backends/sglang/slurm_jobs" else - - export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528-fp4-v2" export SERVED_MODEL_NAME="deepseek-r1-fp4" fi - export ISL="$ISL" export OSL="$OSL" From 45cc883366882e59552d21155bd217ff603ca450 Mon Sep 17 00:00:00 2001 From: Elnifio Date: Wed, 17 Dec 2025 14:50:28 -0800 Subject: [PATCH 20/23] model-path not permitted --- .github/configs/nvidia-master.yaml | 11 +++++------ .github/workflows/benchmark-multinode-tmpl.yml | 2 +- runners/launch_gb200-nv.sh | 2 +- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index a2e9cacc8..ef59d30c0 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -332,9 +332,8 @@ gptoss-fp4-h200-vllm: dsr1-fp4-gb200-dynamo-trt: image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3 - model: deepseek-r1-fp4 # Models are pre-downloaded to this path on GB200 runner to avoid repeated downloading - model-path: /mnt/lustre01/models/deepseek-r1-0528 + model: /mnt/lustre01/models/deepseek-r1-0528 model-prefix: dsr1 runner: gb200 precision: fp4 @@ -776,9 +775,9 @@ dsr1-fp4-gb200-dynamo-trt: dsr1-fp8-gb200-dynamo-sglang: image: lmsysorg/sglang:v0.5.5.post2 - model: deepseek-ai/DeepSeek-R1-0528 + # model: deepseek-ai/DeepSeek-R1-0528 # Models are pre-downloaded to this path on GB200 runner to avoid repeated downloading - model-path: /mnt/lustre01/models/deepseek-r1-0528 + model: /mnt/lustre01/models/deepseek-r1-0528 model-prefix: dsr1 runner: gb200 precision: fp8 @@ -901,9 +900,9 @@ dsr1-fp8-gb200-dynamo-sglang: dsr1-fp4-gb200-dynamo-sglang: image: lmsysorg/sglang:v0.5.5.post2 # TODO: what is the right name? - model: deepseek-ai/DeepSeek-R1-0528-fp4-v2 + # model: deepseek-ai/DeepSeek-R1-0528-fp4-v2 # Models are pre-downloaded to this path on GB200 runner to avoid repeated downloading - model-path: /mnt/lustre01/models/deepseek-r1-0528-fp4-v2 + model: /mnt/lustre01/models/deepseek-r1-0528-fp4-v2 model-prefix: dsr1 runner: gb200 precision: fp4 diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index 0668ce749..90dc68d97 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -86,7 +86,7 @@ env: EXP_NAME: ${{ inputs.exp-name }} IMAGE: ${{ inputs.image }} MODEL_PREFIX: ${{ inputs.model-prefix }} - MODEL_PATH: ${{ inputs.model-path }} + MODEL: ${{ inputs.model }} FRAMEWORK: ${{ inputs.framework }} PRECISION: ${{ inputs.precision }} ISL: ${{ inputs.isl }} diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index fb02290c3..ff611bce8 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -20,7 +20,7 @@ srun --partition=$SLURM_PARTITION --exclusive --time=180 bash -c "enroot import export IMAGE=$SQUASH_FILE # MODEL_PATH is set in `nvidia-master.yaml` or any other yaml files -export MODEL_PATH=$MODEL_PATH +export MODEL_PATH=$MODEL if [[ $FRAMEWORK == "dynamo-sglang" ]]; then export CONFIG_DIR="/mnt/lustre01/artifacts/sglang-configs/1k1k" From a6cc1571fbe1a4a27d4b7acc819f6070490c2cd7 Mon Sep 17 00:00:00 2001 From: Elnifio Date: Wed, 17 Dec 2025 14:56:54 -0800 Subject: [PATCH 21/23] switches the branch --- benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh | 2 +- benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh b/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh index 6b7d1c1b0..eaf329cf8 100644 --- a/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh +++ b/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh @@ -13,7 +13,7 @@ check_env_vars CONC_LIST ISL OSL IMAGE SPEC_DECODING MODEL_PATH \ # Always clone and setup Dynamo echo "Cloning Dynamo repository..." git clone https://github.com/ai-dynamo/dynamo.git -cd dynamo && git checkout ishan/fp48k1k && cd .. # All configs are now tracked in this branch +cd dynamo && git checkout b7107d008392eded64c23a7540fb99bca46b4c91 && cd .. # All configs are frozen in this branch cd "$SGL_SLURM_JOBS_PATH" diff --git a/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh b/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh index 86debfb73..2ddcd7a95 100644 --- a/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh +++ b/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh @@ -13,7 +13,7 @@ check_env_vars CONC_LIST ISL OSL IMAGE SPEC_DECODING MODEL_PATH \ # Always clone and setup Dynamo echo "Cloning Dynamo repository..." git clone https://github.com/ai-dynamo/dynamo.git -cd dynamo && git checkout ishan/fp48k1k && cd .. # All configs are now tracked in this branch +cd dynamo && git checkout b7107d008392eded64c23a7540fb99bca46b4c91 && cd .. # All configs are frozen in this branch cd "$SGL_SLURM_JOBS_PATH" From b3ccea805f67f2445e4dfb96a051e68dbf334101 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 17 Dec 2025 23:14:49 +0000 Subject: [PATCH 22/23] add perf changelog --- perf-changelog.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 594f5fec3..699e6006d 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -95,3 +95,11 @@ description: - "Add benchmark script for GPTOSS FP4 B200 TRT-LLM" pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/256 + +- config-keys: + - dsr1-fp4-gb200-dynamo-trt + - dsr1-fp4-gb200-dynamo-sglang + - dsr1-fp8-gb200-dynamo-sglang + description: + - "Add more configurations for GB200 SGLang DSR1" + pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/335 From 00dcff74fe059167d5bfbaafbe7c7158230715fe Mon Sep 17 00:00:00 2001 From: Elnifio Date: Wed, 17 Dec 2025 16:31:34 -0800 Subject: [PATCH 23/23] used the wrong model path here... --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index ef59d30c0..be2c3a4d3 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -333,7 +333,7 @@ gptoss-fp4-h200-vllm: dsr1-fp4-gb200-dynamo-trt: image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3 # Models are pre-downloaded to this path on GB200 runner to avoid repeated downloading - model: /mnt/lustre01/models/deepseek-r1-0528 + model: /mnt/lustre01/models/deepseek-r1-0528-fp4-v2 model-prefix: dsr1 runner: gb200 precision: fp4