diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 1242386de..be2c3a4d3 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -332,7 +332,8 @@ gptoss-fp4-h200-vllm: dsr1-fp4-gb200-dynamo-trt: image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3 - model: deepseek-r1-fp4 + # Models are pre-downloaded to this path on GB200 runner to avoid repeated downloading + model: /mnt/lustre01/models/deepseek-r1-0528-fp4-v2 model-prefix: dsr1 runner: gb200 precision: fp4 @@ -773,8 +774,10 @@ dsr1-fp4-gb200-dynamo-trt: - "DECODE_MTP_SIZE=0" dsr1-fp8-gb200-dynamo-sglang: - image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1 - model: deepseek-ai/DeepSeek-R1-0528 + image: lmsysorg/sglang:v0.5.5.post2 + # model: deepseek-ai/DeepSeek-R1-0528 + # Models are pre-downloaded to this path on GB200 runner to avoid repeated downloading + model: /mnt/lustre01/models/deepseek-r1-0528 model-prefix: dsr1 runner: gb200 precision: fp8 @@ -798,6 +801,7 @@ dsr1-fp8-gb200-dynamo-sglang: additional-settings: - "PREFILL_NODES=4" - "N_ADDITIONAL_FRONTENDS=9" + - "SCRIPT_MODE=1k1k-max-tpt" decode: num-worker: 1 tp: 1 @@ -819,7 +823,7 @@ dsr1-fp8-gb200-dynamo-sglang: additional-settings: - "PREFILL_NODES=1" - "N_ADDITIONAL_FRONTENDS=9" - - "SCRIPT_MODE=1p_4d" + - "SCRIPT_MODE=1k1k-low-latency" decode: num-worker: 4 tp: 1 @@ -841,6 +845,7 @@ dsr1-fp8-gb200-dynamo-sglang: additional-settings: - "PREFILL_NODES=6" - "N_ADDITIONAL_FRONTENDS=9" + - "SCRIPT_MODE=1k1k-max-tpt" decode: num-worker: 1 tp: 1 @@ -852,22 +857,193 @@ dsr1-fp8-gb200-dynamo-sglang: - isl: 8192 osl: 1024 search-space: + # Low latency (1 prefill worker at DEP4 and 1 decode worker at DEP4) + - spec-decoding: "none" + conc-list: [ 4, 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + - "N_ADDITIONAL_FRONTENDS=8" + - "SCRIPT_MODE=8k1k-low-latency" + decode: + 
num-worker: 1 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + + # Middle and top of curve (5 prefill workers each at DEP8 and 1 decode worker at DEP32) + - spec-decoding: "none" + conc-list: [ 512, 1024, 2048, 6144 ] + prefill: + num-worker: 5 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=10" + - "N_ADDITIONAL_FRONTENDS=8" + - "SCRIPT_MODE=8k1k-max-tpt" + decode: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=8" + +dsr1-fp4-gb200-dynamo-sglang: + image: lmsysorg/sglang:v0.5.5.post2 + # TODO: what is the right name? + # model: deepseek-ai/DeepSeek-R1-0528-fp4-v2 + # Models are pre-downloaded to this path on GB200 runner to avoid repeated downloading + model: /mnt/lustre01/models/deepseek-r1-0528-fp4-v2 + model-prefix: dsr1 + runner: gb200 + precision: fp4 + framework: dynamo-sglang + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # Low latency (1 prefill worker at DEP4 and 2 decode workers at DEP4) + - spec-decoding: "none" + conc-list: [ 4, 8, 32, 64 ] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + - "N_ADDITIONAL_FRONTENDS=8" + - "SCRIPT_MODE=1k1k-low-latency" + decode: + num-worker: 2 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=2" + + # Mid curve (4 prefill workers at DEP4 each and 1 decode worker at DEP48) - spec-decoding: "none" - conc-list: [ 128, 256, 384, 448, 512, 576, 1024, 2048, 4096 ] + conc-list: [ 512, 1024, 2048, 4096, 8192 ] + prefill: + num-worker: 4 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=4" + - "N_ADDITIONAL_FRONTENDS=8" + - "SCRIPT_MODE=1k1k-middle-curve" + decode: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=12" + + # Top of curve (4 prefill workers at DEP4 each and 1 decode worker at DEP32) + - spec-decoding: "none" + conc-list: [ 8192, 12000,
15000 ] + prefill: + num-worker: 4 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=4" + - "N_ADDITIONAL_FRONTENDS=8" + - "SCRIPT_MODE=1k1k-max-tpt" + decode: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=8" + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 4, 8, 32, 64 ] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "N_ADDITIONAL_FRONTENDS=8" + - "SCRIPT_MODE=8k1k-low-latency" + decode: + num-worker: 4 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=4" + - spec-decoding: "none" + conc-list: [ 512, 1024, 2048, 4096 ] prefill: num-worker: 6 - # tp, ep, and dp-attn do nothing because they are hardcoded in the following file: - # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=6" + - "N_ADDITIONAL_FRONTENDS=9" + - "SCRIPT_MODE=8k1k-middle-curve" + decode: + num-worker: 1 tp: 1 ep: 1 dp-attn: true additional-settings: - - "PREFILL_NODES=12" + - "DECODE_NODES=12" + - spec-decoding: "none" + conc-list: [ 1024, 2048, ] + prefill: + num-worker: 10 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=10" - "N_ADDITIONAL_FRONTENDS=8" + - "SCRIPT_MODE=8k1k-max-tpt" decode: num-worker: 1 tp: 1 ep: 1 dp-attn: true additional-settings: - - "DECODE_NODES=6" + - "DECODE_NODES=8" + - spec-decoding: "none" + conc-list: [ 8192 ] + prefill: + num-worker: 10 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=10" + - "N_ADDITIONAL_FRONTENDS=8" + - "SCRIPT_MODE=8k1k-max-tpt" + decode: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=8" diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index 93de8faa0..90dc68d97 
100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -86,6 +86,7 @@ env: EXP_NAME: ${{ inputs.exp-name }} IMAGE: ${{ inputs.image }} MODEL_PREFIX: ${{ inputs.model-prefix }} + MODEL: ${{ inputs.model }} FRAMEWORK: ${{ inputs.framework }} PRECISION: ${{ inputs.precision }} ISL: ${{ inputs.isl }} diff --git a/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh b/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh new file mode 100644 index 000000000..eaf329cf8 --- /dev/null +++ b/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +set -x + +source "$(dirname "$0")/benchmark_lib.sh" + +check_env_vars CONC_LIST ISL OSL IMAGE SPEC_DECODING MODEL_PATH \ + PREFILL_NUM_WORKERS PREFILL_TP PREFILL_EP PREFILL_DP_ATTN \ + DECODE_NUM_WORKERS DECODE_TP DECODE_EP DECODE_DP_ATTN \ + PREFILL_NODES DECODE_NODES N_ADDITIONAL_FRONTENDS SCRIPT_MODE SGL_SLURM_JOBS_PATH # SGL_SLURM_JOBS_PATH FIXME + +# Always clone and setup Dynamo +echo "Cloning Dynamo repository..." +git clone https://github.com/ai-dynamo/dynamo.git +cd dynamo && git checkout b7107d008392eded64c23a7540fb99bca46b4c91 && cd .. # All configs are frozen in this branch + +cd "$SGL_SLURM_JOBS_PATH" + +# Set up SGL launch script-specific environment variables +export TIME_LIMIT="04:00:00" +export MODEL_PATH=$MODEL_PATH +export CONFIG_DIR=$CONFIG_DIR +export CONTAINER_IMAGE=$IMAGE +export GPU_TYPE="gb200-fp4" + +# Launch jobs based on ISL/OSL +# Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented +# by a list of numbers delimited by 'x'. This is because of how the underlying launch script +# expects the concurrencies.
+bash ./submit_disagg.sh $PREFILL_NODES \ + $PREFILL_NUM_WORKERS \ + $DECODE_NODES \ + $DECODE_NUM_WORKERS \ + $N_ADDITIONAL_FRONTENDS \ + $ISL $OSL "${CONC_LIST// /x}" inf \ + $GPU_TYPE \ + $SCRIPT_MODE diff --git a/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh b/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh index 99e2c7afd..2ddcd7a95 100644 --- a/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh +++ b/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh @@ -12,11 +12,8 @@ check_env_vars CONC_LIST ISL OSL IMAGE SPEC_DECODING MODEL_PATH \ # Always clone and setup Dynamo echo "Cloning Dynamo repository..." -if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then - git clone --branch ishan/sa-1.1-sgl-dsr1-fp8 https://github.com/ai-dynamo/dynamo.git -else - git clone --branch update-result-file-name https://github.com/Elnifio/dynamo.git -fi +git clone https://github.com/ai-dynamo/dynamo.git +cd dynamo && git checkout b7107d008392eded64c23a7540fb99bca46b4c91 && cd .. # All configs are frozen in this branch cd "$SGL_SLURM_JOBS_PATH" @@ -25,6 +22,7 @@ export TIME_LIMIT="04:00:00" export MODEL_PATH=$MODEL_PATH export CONFIG_DIR=$CONFIG_DIR export CONTAINER_IMAGE=$IMAGE +export GPU_TYPE="gb200-fp8" # Launch jobs based on ISL/OSL # Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented @@ -36,4 +34,5 @@ bash ./submit_disagg.sh $PREFILL_NODES \ $DECODE_NUM_WORKERS \ $N_ADDITIONAL_FRONTENDS \ $ISL $OSL "${CONC_LIST// /x}" inf \ - $SCRIPT_MODE \ No newline at end of file + $GPU_TYPE \ + $SCRIPT_MODE diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 594f5fec3..699e6006d 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -95,3 +95,11 @@ description: - "Add benchmark script for GPTOSS FP4 B200 TRT-LLM" pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/256 + +- config-keys: + - dsr1-fp4-gb200-dynamo-trt + - dsr1-fp4-gb200-dynamo-sglang + - dsr1-fp8-gb200-dynamo-sglang + description: + - "Add more configurations for GB200 SGLang 
DSR1" + pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/335 diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index d9164469e..ff611bce8 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -13,35 +13,22 @@ export SLURM_JOB_NAME="benchmark-dynamo.job" # For now we add conditionals to this script to use newer code for the 1k1k configs ### FRAMEWORK_DIFF_IF_STATEMENT #1 - difference in setting up envvars -if [[ $FRAMEWORK == "dynamo-sglang" ]]; then - # Set IMAGE based on ISL/OSL - if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then - export IMAGE="/mnt/lustre01/artifacts/containers/lmsysorg+sglang+v0.5.5.post2.sqsh" - else - export IMAGE="/mnt/lustre01/artifacts/containers/dynamo-sglang.sqsh" - fi - export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528" - export CONFIG_DIR="/mnt/lustre01/artifacts/sglang-configs/1k1k" +SQUASH_FILE="/mnt/lustre01/users/sa-shared/images/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +srun --partition=$SLURM_PARTITION --exclusive --time=180 bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" - # FIXME: Another workaround for all the different branching - # THIS NEEDS TO BE STANDARDIZED ASAP - if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then - export SGL_SLURM_JOBS_PATH="dynamo/examples/backends/sglang/slurm_jobs" - else - export SGL_SLURM_JOBS_PATH="dynamo/components/backends/sglang/slurm_jobs" - fi -else - SQUASH_FILE="/mnt/lustre01/users/sa-shared/images/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" - srun --partition=$SLURM_PARTITION --exclusive --time=180 bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" +# Update the IMAGE variable to the squash file +export IMAGE=$SQUASH_FILE - # Update the IMAGE variable to the squash file - export IMAGE=$SQUASH_FILE +# MODEL_PATH is set in `nvidia-master.yaml` or any other yaml files +export MODEL_PATH=$MODEL - export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528-fp4-v2" +if [[ $FRAMEWORK == "dynamo-sglang" ]]; then + export 
CONFIG_DIR="/mnt/lustre01/artifacts/sglang-configs/1k1k" + export SGL_SLURM_JOBS_PATH="dynamo/examples/backends/sglang/slurm_jobs" +else export SERVED_MODEL_NAME="deepseek-r1-fp4" fi - export ISL="$ISL" export OSL="$OSL" @@ -148,4 +135,4 @@ PY done fi -echo "All result files processed" \ No newline at end of file +echo "All result files processed"