Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
a1a0325
bring all configs here
Elnifio Dec 9, 2025
c03076b
test for GB200 only
Elnifio Dec 9, 2025
028f224
updates the files and git clone urls
Elnifio Dec 9, 2025
25a19b1
update the prefill nodes
Elnifio Dec 10, 2025
124ddf4
update 1k1k fp4 config
Elnifio Dec 10, 2025
6199031
updates to run 1k1k fp4 only
Elnifio Dec 10, 2025
344ac6c
updates the FP4 8k1k
Elnifio Dec 10, 2025
355773a
update the model path
Elnifio Dec 10, 2025
0dd1e5a
restore changes to full sweeps
Elnifio Dec 10, 2025
7da0be5
updates the config for 1k1k fp4
Elnifio Dec 11, 2025
b38b633
temporarily disable some concurrencies
Elnifio Dec 11, 2025
8136816
updates the params
Elnifio Dec 12, 2025
c1f1be4
updates the branch
Elnifio Dec 12, 2025
7a8e890
update config
Elnifio Dec 15, 2025
ce40018
temporarily disable all other configs
Elnifio Dec 15, 2025
35c7eb3
Revert "temporarily disable all other configs"
Elnifio Dec 16, 2025
b26d699
update comments
Elnifio Dec 16, 2025
5b0509a
Merge branch 'main' into ishan/moreconfigs
cquil11 Dec 17, 2025
c1024db
bump the image for DSR1
Elnifio Dec 17, 2025
3d4c3ae
Merge branch 'main' into ishan/moreconfigs
yunzhoul-nv Dec 17, 2025
35d7555
update the model-path args
Elnifio Dec 17, 2025
45cc883
model-path not permitted
Elnifio Dec 17, 2025
a6cc157
switches the branch
Elnifio Dec 17, 2025
2731ccb
Merge branch 'main' into ishan/moreconfigs
cquil11 Dec 17, 2025
b3ccea8
add perf changelog
cquil11 Dec 17, 2025
00dcff7
used the wrong model path here...
Elnifio Dec 18, 2025
e845bdd
Merge branch 'main' into ishan/moreconfigs
cquil11 Dec 18, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
194 changes: 185 additions & 9 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -332,7 +332,8 @@ gptoss-fp4-h200-vllm:

dsr1-fp4-gb200-dynamo-trt:
image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3
model: deepseek-r1-fp4
# Models are pre-downloaded to this path on GB200 runner to avoid repeated downloading
model: /mnt/lustre01/models/deepseek-r1-0528-fp4-v2
model-prefix: dsr1
runner: gb200
precision: fp4
Expand Down Expand Up @@ -773,8 +774,10 @@ dsr1-fp4-gb200-dynamo-trt:
- "DECODE_MTP_SIZE=0"

dsr1-fp8-gb200-dynamo-sglang:
image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1
model: deepseek-ai/DeepSeek-R1-0528
image: lmsysorg/sglang:v0.5.5.post2
# model: deepseek-ai/DeepSeek-R1-0528
# Models are pre-downloaded to this path on GB200 runner to avoid repeated downloading
model: /mnt/lustre01/models/deepseek-r1-0528
model-prefix: dsr1
runner: gb200
precision: fp8
Expand All @@ -798,6 +801,7 @@ dsr1-fp8-gb200-dynamo-sglang:
additional-settings:
- "PREFILL_NODES=4"
- "N_ADDITIONAL_FRONTENDS=9"
- "SCRIPT_MODE=1k1k-max-tpt"
decode:
num-worker: 1
tp: 1
Expand All @@ -819,7 +823,7 @@ dsr1-fp8-gb200-dynamo-sglang:
additional-settings:
- "PREFILL_NODES=1"
- "N_ADDITIONAL_FRONTENDS=9"
- "SCRIPT_MODE=1p_4d"
- "SCRIPT_MODE=1k1k-low-latency"
decode:
num-worker: 4
tp: 1
Expand All @@ -841,6 +845,7 @@ dsr1-fp8-gb200-dynamo-sglang:
additional-settings:
- "PREFILL_NODES=6"
- "N_ADDITIONAL_FRONTENDS=9"
- "SCRIPT_MODE=1k1k-max-tpt"
decode:
num-worker: 1
tp: 1
Expand All @@ -852,22 +857,193 @@ dsr1-fp8-gb200-dynamo-sglang:
- isl: 8192
osl: 1024
search-space:
# Low latency (1 prefill worker at DEP4 and 1 decode worker at DEP4)
- spec-decoding: "none"
conc-list: [ 4, 8, 16, 32, 64, 128, 256, 512 ]
prefill:
num-worker: 1
tp: 1
ep: 1
dp-attn: true
additional-settings:
- "PREFILL_NODES=1"
- "N_ADDITIONAL_FRONTENDS=8"
- "SCRIPT_MODE=8k1k-low-latency"
decode:
num-worker: 1
tp: 1
ep: 1
dp-attn: true
additional-settings:
- "DECODE_NODES=1"

# Middle and top of curve (5 prefill workers each at DEP8 and 1 decode worker at DEP32)
- spec-decoding: "none"
conc-list: [ 512, 1024, 2048, 6144 ]
prefill:
num-worker: 5
tp: 1
ep: 1
dp-attn: true
additional-settings:
- "PREFILL_NODES=10"
- "N_ADDITIONAL_FRONTENDS=8"
- "SCRIPT_MODE=8k1k-max-tpt"
decode:
num-worker: 1
tp: 1
ep: 1
dp-attn: true
additional-settings:
- "DECODE_NODES=8"

dsr1-fp4-gb200-dynamo-sglang:
image: lmsysorg/sglang:v0.5.5.post2
# TODO: what is the right name?
# model: deepseek-ai/DeepSeek-R1-0528-fp4-v2
# Models are pre-downloaded to this path on GB200 runner to avoid repeated downloading
model: /mnt/lustre01/models/deepseek-r1-0528-fp4-v2
model-prefix: dsr1
runner: gb200
precision: fp4
framework: dynamo-sglang
multinode: true
disagg: true
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
# Low latency (1 prefill worker at DEP4 and 2 decode workers at DEP4)
- spec-decoding: "none"
conc-list: [ 4, 8, 32, 64 ]
prefill:
num-worker: 1
tp: 1
ep: 1
dp-attn: true
additional-settings:
- "PREFILL_NODES=1"
- "N_ADDITIONAL_FRONTENDS=8"
- "SCRIPT_MODE=1k1k-low-latency"
decode:
num-worker: 2
tp: 1
ep: 1
dp-attn: true
additional-settings:
- "DECODE_NODES=2"

      # Mid curve (4 prefill workers each at DEP4 and 1 decode worker at DEP48)
- spec-decoding: "none"
conc-list: [ 128, 256, 384, 448, 512, 576, 1024, 2048, 4096 ]
conc-list: [ 512, 1024, 2048, 4096, 8192 ]
prefill:
num-worker: 4
tp: 1
ep: 1
dp-attn: true
additional-settings:
- "PREFILL_NODES=4"
- "N_ADDITIONAL_FRONTENDS=8"
- "SCRIPT_MODE=1k1k-middle-curve"
decode:
num-worker: 1
tp: 1
ep: 1
dp-attn: true
additional-settings:
- "DECODE_NODES=12"

      # Top of curve (4 prefill workers each at DEP4 and 1 decode worker at DEP32)
- spec-decoding: "none"
conc-list: [ 8192, 12000, 15000 ]
prefill:
num-worker: 4
tp: 1
ep: 1
dp-attn: true
additional-settings:
- "PREFILL_NODES=4"
- "N_ADDITIONAL_FRONTENDS=8"
- "SCRIPT_MODE=1k1k-max-tpt"
decode:
num-worker: 1
tp: 1
ep: 1
dp-attn: true
additional-settings:
- "DECODE_NODES=8"
- isl: 8192
osl: 1024
search-space:
- spec-decoding: "none"
conc-list: [ 4, 8, 32, 64 ]
prefill:
num-worker: 1
tp: 1
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
- "N_ADDITIONAL_FRONTENDS=8"
- "SCRIPT_MODE=8k1k-low-latency"
decode:
num-worker: 4
tp: 1
ep: 1
dp-attn: true
additional-settings:
- "DECODE_NODES=4"
- spec-decoding: "none"
conc-list: [ 512, 1024, 2048, 4096 ]
prefill:
num-worker: 6
# tp, ep, and dp-attn do nothing because they are hardcoded in the following file:
# https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh
tp: 1
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=6"
- "N_ADDITIONAL_FRONTENDS=9"
- "SCRIPT_MODE=8k1k-middle-curve"
decode:
num-worker: 1
tp: 1
ep: 1
dp-attn: true
additional-settings:
- "PREFILL_NODES=12"
- "DECODE_NODES=12"
- spec-decoding: "none"
conc-list: [ 1024, 2048, ]
prefill:
num-worker: 10
tp: 1
ep: 1
dp-attn: true
additional-settings:
- "PREFILL_NODES=10"
- "N_ADDITIONAL_FRONTENDS=8"
- "SCRIPT_MODE=8k1k-max-tpt"
decode:
num-worker: 1
tp: 1
ep: 1
dp-attn: true
additional-settings:
- "DECODE_NODES=6"
- "DECODE_NODES=8"
- spec-decoding: "none"
conc-list: [ 8192 ]
prefill:
num-worker: 10
tp: 1
ep: 1
dp-attn: true
additional-settings:
- "PREFILL_NODES=10"
- "N_ADDITIONAL_FRONTENDS=8"
- "SCRIPT_MODE=8k1k-max-tpt"
decode:
num-worker: 1
tp: 1
ep: 1
dp-attn: true
additional-settings:
- "DECODE_NODES=8"
1 change: 1 addition & 0 deletions .github/workflows/benchmark-multinode-tmpl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ env:
EXP_NAME: ${{ inputs.exp-name }}
IMAGE: ${{ inputs.image }}
MODEL_PREFIX: ${{ inputs.model-prefix }}
MODEL: ${{ inputs.model }}
FRAMEWORK: ${{ inputs.framework }}
PRECISION: ${{ inputs.precision }}
ISL: ${{ inputs.isl }}
Expand Down
38 changes: 38 additions & 0 deletions benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/bin/bash
# Benchmark launcher: DeepSeek-R1 FP4 on GB200 via Dynamo + SGLang (SLURM).
# Clones a pinned Dynamo revision, then submits disaggregated prefill/decode
# benchmark jobs through the SGLang slurm_jobs helper scripts.

set -x

source "$(dirname "$0")/benchmark_lib.sh"

# Fail fast if any required environment variable is missing.
# SCRIPT_MODE is consumed positionally by submit_disagg.sh below, so it must
# be validated here too — an empty value would silently shift the argument list.
check_env_vars CONC_LIST ISL OSL IMAGE SPEC_DECODING MODEL_PATH \
    PREFILL_NUM_WORKERS PREFILL_TP PREFILL_EP PREFILL_DP_ATTN \
    DECODE_NUM_WORKERS DECODE_TP DECODE_EP DECODE_DP_ATTN \
    PREFILL_NODES DECODE_NODES N_ADDITIONAL_FRONTENDS \
    SGL_SLURM_JOBS_PATH SCRIPT_MODE

# Always clone and set up Dynamo, pinned to a fixed commit so the launch
# scripts and configs are frozen for reproducible benchmark runs.
echo "Cloning Dynamo repository..."
git clone https://github.com/ai-dynamo/dynamo.git
cd dynamo && git checkout b7107d008392eded64c23a7540fb99bca46b4c91 && cd .. # All configs are frozen at this commit

cd "$SGL_SLURM_JOBS_PATH"

# Set up SGL launch script-specific environment variables.
# MODEL_PATH and CONFIG_DIR are already set by the caller; re-export them so
# they are visible to the submit script and its child processes.
export TIME_LIMIT="04:00:00"
export MODEL_PATH
export CONFIG_DIR
export CONTAINER_IMAGE="$IMAGE"
export GPU_TYPE="gb200-fp4"

# Launch jobs based on ISL/OSL.
# Replace ' ' in CONC_LIST with 'x' so the concurrency list is represented as
# a single token of numbers delimited by 'x', which is how the underlying
# launch script expects the concurrencies.
bash ./submit_disagg.sh "$PREFILL_NODES" \
    "$PREFILL_NUM_WORKERS" \
    "$DECODE_NODES" \
    "$DECODE_NUM_WORKERS" \
    "$N_ADDITIONAL_FRONTENDS" \
    "$ISL" "$OSL" "${CONC_LIST// /x}" inf \
    "$GPU_TYPE" \
    "$SCRIPT_MODE"
11 changes: 5 additions & 6 deletions benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,8 @@ check_env_vars CONC_LIST ISL OSL IMAGE SPEC_DECODING MODEL_PATH \

# Always clone and setup Dynamo
echo "Cloning Dynamo repository..."
if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then
git clone --branch ishan/sa-1.1-sgl-dsr1-fp8 https://github.com/ai-dynamo/dynamo.git
else
git clone --branch update-result-file-name https://github.com/Elnifio/dynamo.git
fi
git clone https://github.com/ai-dynamo/dynamo.git
cd dynamo && git checkout b7107d008392eded64c23a7540fb99bca46b4c91 && cd .. # All configs are frozen at this commit

cd "$SGL_SLURM_JOBS_PATH"

Expand All @@ -25,6 +22,7 @@ export TIME_LIMIT="04:00:00"
export MODEL_PATH=$MODEL_PATH
export CONFIG_DIR=$CONFIG_DIR
export CONTAINER_IMAGE=$IMAGE
export GPU_TYPE="gb200-fp8"

# Launch jobs based on ISL/OSL
# Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented
Expand All @@ -36,4 +34,5 @@ bash ./submit_disagg.sh $PREFILL_NODES \
$DECODE_NUM_WORKERS \
$N_ADDITIONAL_FRONTENDS \
$ISL $OSL "${CONC_LIST// /x}" inf \
$SCRIPT_MODE
$GPU_TYPE \
$SCRIPT_MODE
8 changes: 8 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -95,3 +95,11 @@
description:
- "Add benchmark script for GPTOSS FP4 B200 TRT-LLM"
pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/256

- config-keys:
- dsr1-fp4-gb200-dynamo-trt
- dsr1-fp4-gb200-dynamo-sglang
- dsr1-fp8-gb200-dynamo-sglang
description:
- "Add more configurations for GB200 SGLang DSR1"
pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/335
35 changes: 11 additions & 24 deletions runners/launch_gb200-nv.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,35 +13,22 @@ export SLURM_JOB_NAME="benchmark-dynamo.job"
# For now we add conditionals to this script to use newer code for the 1k1k configs

### FRAMEWORK_DIFF_IF_STATEMENT #1 - difference in setting up envvars
if [[ $FRAMEWORK == "dynamo-sglang" ]]; then
# Set IMAGE based on ISL/OSL
if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then
export IMAGE="/mnt/lustre01/artifacts/containers/lmsysorg+sglang+v0.5.5.post2.sqsh"
else
export IMAGE="/mnt/lustre01/artifacts/containers/dynamo-sglang.sqsh"
fi
export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528"
export CONFIG_DIR="/mnt/lustre01/artifacts/sglang-configs/1k1k"
SQUASH_FILE="/mnt/lustre01/users/sa-shared/images/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice! thanks

srun --partition=$SLURM_PARTITION --exclusive --time=180 bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE"

# FIXME: Another workaround for all the different branching
# THIS NEEDS TO BE STANDARDIZED ASAP
if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then
export SGL_SLURM_JOBS_PATH="dynamo/examples/backends/sglang/slurm_jobs"
else
export SGL_SLURM_JOBS_PATH="dynamo/components/backends/sglang/slurm_jobs"
fi
else
SQUASH_FILE="/mnt/lustre01/users/sa-shared/images/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
srun --partition=$SLURM_PARTITION --exclusive --time=180 bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE"
# Update the IMAGE variable to the squash file
export IMAGE=$SQUASH_FILE

# Update the IMAGE variable to the squash file
export IMAGE=$SQUASH_FILE
# MODEL_PATH is set in `nvidia-master.yaml` or any other yaml files
export MODEL_PATH=$MODEL

export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528-fp4-v2"
if [[ $FRAMEWORK == "dynamo-sglang" ]]; then
export CONFIG_DIR="/mnt/lustre01/artifacts/sglang-configs/1k1k"
export SGL_SLURM_JOBS_PATH="dynamo/examples/backends/sglang/slurm_jobs"
else
export SERVED_MODEL_NAME="deepseek-r1-fp4"
fi


export ISL="$ISL"
export OSL="$OSL"

Expand Down Expand Up @@ -148,4 +135,4 @@ PY
done
fi

echo "All result files processed"
echo "All result files processed"