From 44868fc8a659c6f2b9daedd33ccabd0738388481 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Wed, 3 Dec 2025 16:39:42 -0800 Subject: [PATCH 1/4] go --- .github/configs/nvidia-master.yaml | 102 +++++++++++++++++- .../dsr1_fp8_gb200_dynamo-sglang_slurm.sh | 6 +- 2 files changed, 98 insertions(+), 10 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index c4370f483..3b094d572 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -852,17 +852,37 @@ dsr1-fp8-gb200-dynamo-sglang: - isl: 8192 osl: 1024 search-space: + # Low latency (1 prefill worker at DEP4 and 1 decode worker at DEP4) - spec-decoding: "none" - conc-list: [ 128, 256, 384, 448, 512, 576, 1024, 2048, 4096 ] + conc-list: [ 4, 8, 16, 32, 64, 128, 256, 512 ] prefill: - num-worker: 6 + num-worker: 1 # tp, ep, and dp-attn do nothing because they are hardcoded in the following file: - # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh tp: 1 ep: 1 dp-attn: true additional-settings: - - "PREFILL_NODES=12" + - "PREFILL_NODES=1" + - "N_ADDITIONAL_FRONTENDS=8" + decode: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + + # Middle and top of curve (5 prefill workers each at DEP8 and 1 decode worker at DEP32) + - spec-decoding: "none" + conc-list: [ 512, 1024, 2048, 6144 ] + prefill: + num-worker: 5 + # tp, ep, and dp-attn do nothing because they are hardcoded in the following file: + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" - "N_ADDITIONAL_FRONTENDS=8" decode: num-worker: 1 @@ -870,4 +890,76 @@ dsr1-fp8-gb200-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "DECODE_NODES=6" + - "DECODE_NODES=8" + + dsr1-fp8-gb200-dynamo-sglang: + # TODO: swap + image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1 + # TODO: what is the right name? 
+ model: deepseek-ai/DeepSeek-R1-0528-fp4-v2 + model-prefix: dsr1 + runner: gb200 + precision: fp4 + framework: dynamo-sglang + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # Low latency (1 prefill worker at DEP4 and 2 decode workers at DEP4) + - spec-decoding: "none" + conc-list: [ 4, 8, 32, 64, 128, 112, 128, 256 ] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + - "N_ADDITIONAL_FRONTENDS=8" + decode: + num-worker: 2 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=2" + + # Mid curve (1 prefill worker at DEP4 and 1 decode workers at DEP48) + - spec-decoding: "none" + conc-list: [ 512, 1024, 2048, 4096, 8192 ] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + - "N_ADDITIONAL_FRONTENDS=8" + decode: + num-worker: 2 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=12" + + # Top of curve (1 prefill worker at DEP4 and 1 decode worker at DEP32) + - spec-decoding: "none" + conc-list: [ 8192, 12000, 15000 ] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + - "N_ADDITIONAL_FRONTENDS=8" + decode: + num-worker: 2 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=8" \ No newline at end of file diff --git a/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh b/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh index 99e2c7afd..f538e3e29 100644 --- a/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh +++ b/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh @@ -12,11 +12,7 @@ check_env_vars CONC_LIST ISL OSL IMAGE SPEC_DECODING MODEL_PATH \ # Always clone and setup Dynamo echo "Cloning Dynamo repository..." 
-if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then - git clone --branch ishan/sa-1.1-sgl-dsr1-fp8 https://github.com/ai-dynamo/dynamo.git -else - git clone --branch update-result-file-name https://github.com/Elnifio/dynamo.git -fi +git clone --branch ishan/sa-1.1-sgl-dsr1 https://github.com/ai-dynamo/dynamo.git cd "$SGL_SLURM_JOBS_PATH" From 0e3c3598b1f5367a4165ee19bae7446b714755d3 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Wed, 3 Dec 2025 16:40:46 -0800 Subject: [PATCH 2/4] typo --- .github/configs/nvidia-master.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 3b094d572..881d68591 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -857,7 +857,6 @@ dsr1-fp8-gb200-dynamo-sglang: conc-list: [ 4, 8, 16, 32, 64, 128, 256, 512 ] prefill: num-worker: 1 - # tp, ep, and dp-attn do nothing because they are hardcoded in the following file: tp: 1 ep: 1 dp-attn: true @@ -877,7 +876,6 @@ dsr1-fp8-gb200-dynamo-sglang: conc-list: [ 512, 1024, 2048, 6144 ] prefill: num-worker: 5 - # tp, ep, and dp-attn do nothing because they are hardcoded in the following file: tp: 1 ep: 1 dp-attn: true @@ -892,7 +890,7 @@ dsr1-fp8-gb200-dynamo-sglang: additional-settings: - "DECODE_NODES=8" - dsr1-fp8-gb200-dynamo-sglang: + dsr1-fp4-gb200-dynamo-sglang: # TODO: swap image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1 # TODO: what is the right name? From 297bd7fc640345cb2f2b79e8ee5bebd251bf0b89 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Wed, 3 Dec 2025 16:42:31 -0800 Subject: [PATCH 3/4] typo... 
--- .github/configs/nvidia-master.yaml | 142 ++++++++++++++--------------- 1 file changed, 71 insertions(+), 71 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 881d68591..999d34441 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -890,74 +890,74 @@ dsr1-fp8-gb200-dynamo-sglang: additional-settings: - "DECODE_NODES=8" - dsr1-fp4-gb200-dynamo-sglang: - # TODO: swap - image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1 - # TODO: what is the right name? - model: deepseek-ai/DeepSeek-R1-0528-fp4-v2 - model-prefix: dsr1 - runner: gb200 - precision: fp4 - framework: dynamo-sglang - multinode: true - disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # Low latency (1 prefill worker at DEP4 and 2 decode workers at DEP4) - - spec-decoding: "none" - conc-list: [ 4, 8, 32, 64, 128, 112, 128, 256 ] - prefill: - num-worker: 1 - tp: 1 - ep: 1 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - - "N_ADDITIONAL_FRONTENDS=8" - decode: - num-worker: 2 - tp: 1 - ep: 1 - dp-attn: true - additional-settings: - - "DECODE_NODES=2" - - # Mid curve (1 prefill worker at DEP4 and 1 decode workers at DEP48) - - spec-decoding: "none" - conc-list: [ 512, 1024, 2048, 4096, 8192 ] - prefill: - num-worker: 1 - tp: 1 - ep: 1 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - - "N_ADDITIONAL_FRONTENDS=8" - decode: - num-worker: 2 - tp: 1 - ep: 1 - dp-attn: true - additional-settings: - - "DECODE_NODES=12" - - # Top of curve (1 prefill worker at DEP4 and 1 decode worker at DEP32) - - spec-decoding: "none" - conc-list: [ 8192, 12000, 15000 ] - prefill: - num-worker: 1 - tp: 1 - ep: 1 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - - "N_ADDITIONAL_FRONTENDS=8" - decode: - num-worker: 2 - tp: 1 - ep: 1 - dp-attn: true - additional-settings: - - "DECODE_NODES=8" \ No newline at end of file +dsr1-fp4-gb200-dynamo-sglang: + # TODO: swap + 
image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1 + # TODO: what is the right name? + model: deepseek-ai/DeepSeek-R1-0528-fp4-v2 + model-prefix: dsr1 + runner: gb200 + precision: fp4 + framework: dynamo-sglang + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # Low latency (1 prefill worker at DEP4 and 2 decode workers at DEP4) + - spec-decoding: "none" + conc-list: [ 4, 8, 32, 64, 128, 112, 128, 256 ] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + - "N_ADDITIONAL_FRONTENDS=8" + decode: + num-worker: 2 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=2" + + # Mid curve (1 prefill worker at DEP4 and 1 decode workers at DEP48) + - spec-decoding: "none" + conc-list: [ 512, 1024, 2048, 4096, 8192 ] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + - "N_ADDITIONAL_FRONTENDS=8" + decode: + num-worker: 2 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=12" + + # Top of curve (1 prefill worker at DEP4 and 1 decode worker at DEP32) + - spec-decoding: "none" + conc-list: [ 8192, 12000, 15000 ] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + - "N_ADDITIONAL_FRONTENDS=8" + decode: + num-worker: 2 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=8" \ No newline at end of file From 2cc19a3ec4e245ee6ce868e1fe3c01a8782e723a Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Wed, 3 Dec 2025 16:50:16 -0800 Subject: [PATCH 4/4] more --- .github/configs/nvidia-master.yaml | 1 + .../dsr1_fp4_gb200_dynamo-sglang_slurm.sh | 37 +++++++++++++++++++ .../dsr1_fp8_gb200_dynamo-sglang_slurm.sh | 2 + 3 files changed, 40 insertions(+) create mode 100644 benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 
999d34441..71fb257e8 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -798,6 +798,7 @@ dsr1-fp8-gb200-dynamo-sglang: additional-settings: - "PREFILL_NODES=4" - "N_ADDITIONAL_FRONTENDS=9" + - "SCRIPT_MODE=max-tpt" decode: num-worker: 1 tp: 1 diff --git a/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh b/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh new file mode 100644 index 000000000..7a668f30c --- /dev/null +++ b/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh @@ -0,0 +1,37 @@ + +#!/bin/bash + +set -x + +source "$(dirname "$0")/benchmark_lib.sh" + +check_env_vars CONC_LIST ISL OSL IMAGE SPEC_DECODING MODEL_PATH \ + PREFILL_NUM_WORKERS PREFILL_TP PREFILL_EP PREFILL_DP_ATTN \ + DECODE_NUM_WORKERS DECODE_TP DECODE_EP DECODE_DP_ATTN \ + PREFILL_NODES DECODE_NODES N_ADDITIONAL_FRONTENDS SGL_SLURM_JOBS_PATH # SGL_SLURM_JOBS_PATH FIXME + +# Always clone and setup Dynamo +echo "Cloning Dynamo repository..." +git clone --branch ishan/sa-1.1-sgl-dsr1 https://github.com/ai-dynamo/dynamo.git + +cd "$SGL_SLURM_JOBS_PATH" + +# Set up SGL launch script-specific environment variables +export TIME_LIMIT="04:00:00" +export MODEL_PATH=$MODEL_PATH +export CONFIG_DIR=$CONFIG_DIR +export CONTAINER_IMAGE=$IMAGE +export GPU_TYPE="gb200-fp4" + +# Launch jobs based on ISL/OSL +# Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented +# by a list of numbers delimited by 'x'. This is because of how the underlying launch script +# expects the concurrencies. 
+bash ./submit_disagg.sh $PREFILL_NODES \ + $PREFILL_NUM_WORKERS \ + $DECODE_NODES \ + $DECODE_NUM_WORKERS \ + $N_ADDITIONAL_FRONTENDS \ + $ISL $OSL "${CONC_LIST// /x}" inf \ + $GPU_TYPE \ + $SCRIPT_MODE \ No newline at end of file diff --git a/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh b/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh index f538e3e29..4e44b0414 100644 --- a/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh +++ b/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh @@ -21,6 +21,7 @@ export TIME_LIMIT="04:00:00" export MODEL_PATH=$MODEL_PATH export CONFIG_DIR=$CONFIG_DIR export CONTAINER_IMAGE=$IMAGE +export GPU_TYPE="gb200-fp8" # Launch jobs based on ISL/OSL # Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented @@ -32,4 +33,5 @@ bash ./submit_disagg.sh $PREFILL_NODES \ $DECODE_NUM_WORKERS \ $N_ADDITIONAL_FRONTENDS \ $ISL $OSL "${CONC_LIST// /x}" inf \ + $GPU_TYPE \ $SCRIPT_MODE \ No newline at end of file