From 98982c619f8310cb6c8ca3430cdb5c18b442942a Mon Sep 17 00:00:00 2001 From: Jatin Gangani Date: Wed, 4 Feb 2026 16:32:38 -0800 Subject: [PATCH 01/10] Update GB200 and GB300 SGLANG FP8 DSR1 configs --- .github/configs/nvidia-master.yaml | 228 +++++++++++++++++++++-------- perf-changelog.yaml | 10 ++ runners/launch_gb200-nv.sh | 46 +----- 3 files changed, 177 insertions(+), 107 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index a5cad5206..cff3839e4 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2862,9 +2862,9 @@ dsr1-fp8-gb200-dynamo-trt: dsr1-fp8-gb200-dynamo-sglang: - image: lmsysorg/sglang:v0.5.5.post2 + image: lmsysorg/sglang:v0.5.8-cu130 model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 + model-prefix: dsr1-fp8 runner: gb200 precision: fp8 framework: dynamo-sglang @@ -2874,114 +2874,216 @@ dsr1-fp8-gb200-dynamo-sglang: - isl: 1024 osl: 1024 search-space: - # "Top of curve" (2 prefill workers each at DEP8 and 1 decode worker at DEP32) - - spec-decoding: "none" - conc-list: [ 4096 ] + # "Low latency" (1 prefill worker at TP4 and 1 decode worker at TP4) + - conc-list: [4, 8] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/1k1k/low-latency.yaml + - "CONFIG_FILE=recipes/gb200-fp8/1k1k/low-latency.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + + # "Mid curve" (3 prefill workers at DEP8 and 1 decode worker at DEP48) + - conc-list: [1024, 2048, 4096] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/1k1k/mid-curve.yaml + - "CONFIG_FILE=recipes/gb200-fp8/1k1k/mid-curve.yaml" + decode: + num-worker: 1 + tp: 48 + ep: 48 + dp-attn: true + + # "Max throughput" (2 prefill workers at DEP8 and 1 decode worker at DEP32) + - conc-list: [1024, 2048, 4096, 6144] prefill: num-worker: 2 - # tp, ep, and dp-attn do nothing because they are hardcoded in the following file: - # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh - tp: 1 + tp: 8 ep: 8 dp-attn: true additional-settings: - - "PREFILL_NODES=4" - - "N_ADDITIONAL_FRONTENDS=9" - - "SCRIPT_MODE=1k1k-max-tpt" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/1k1k/max-tpt.yaml + - "CONFIG_FILE=recipes/gb200-fp8/1k1k/max-tpt.yaml" decode: num-worker: 1 - tp: 1 + tp: 32 ep: 32 dp-attn: true + + - isl: 8192 + osl: 1024 + search-space: + # "Low latency" (1 prefill worker at TP8 and 1 decode worker at TP8) + - conc-list: [4, 8, 16] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false additional-settings: - - "DECODE_NODES=8" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/8k1k/low-latency.yaml + - "CONFIG_FILE=recipes/gb200-fp8/8k1k/low-latency.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false - # "Bottom of curve" (1 prefill worker at DEP4 and 4 decode workers at DEP4) - - spec-decoding: "none" - conc-list: [ 2, 4, 8, 16, 64, 128 ] + # "Mid curve" (5 prefill workers at DEP8 and 1 decode worker at DEP32) + - conc-list: [512, 1024, 2048, 6144] prefill: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/8k1k/mid-curve.yaml + - "CONFIG_FILE=recipes/gb200-fp8/8k1k/mid-curve.yaml" + decode: num-worker: 1 - # tp, ep, and dp-attn do nothing because they are hardcoded in the following file: - # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh - tp: 1 - ep: 4 + tp: 32 + ep: 32 + dp-attn: true + + # "Max throughput" (6 prefill workers at DEP8 and 1 decode worker at DEP24) + - conc-list: [2048, 4096, 6144] + prefill: + num-worker: 6 + tp: 8 + ep: 8 dp-attn: true additional-settings: - - "PREFILL_NODES=1" - - "N_ADDITIONAL_FRONTENDS=9" - - "SCRIPT_MODE=1k1k-low-latency" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/8k1k/max-tpt.yaml + - "CONFIG_FILE=recipes/gb200-fp8/8k1k/max-tpt.yaml" decode: - num-worker: 4 - tp: 1 - ep: 4 + num-worker: 1 + tp: 24 + ep: 24 dp-attn: true + +dsr1-fp8-gb300-dynamo-sglang: + image: lmsysorg/sglang:v0.5.8-cu130 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1-fp8 + runner: gb300 + precision: fp8 + framework: dynamo-sglang + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # "Low latency" (1 prefill worker at TP4 and 4 decode workers at TP4) + - conc-list: [4, 8, 16, 32] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false additional-settings: - - "DECODE_NODES=4" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/1k1k/stp/low-latency.yaml + - "CONFIG_FILE=recipes/gb300-fp8/1k1k/stp/low-latency.yaml" + decode: + num-worker: 4 + tp: 4 + ep: 1 + dp-attn: false - # "Middle of curve" (3 prefill workers each at DEP8 and 1 decode worker at DEP48) - - spec-decoding: "none" - conc-list: [ 1024, 2048, 4096 ] + # "Mid curve" (2 prefill workers at DEP8 and 1 decode worker at DEP32) + - conc-list: [1024, 2048, 4096, 6144] prefill: - num-worker: 3 - # tp, ep, and dp-attn do nothing because they are hardcoded in the following file: - # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh - tp: 1 + num-worker: 2 + tp: 8 ep: 8 dp-attn: true additional-settings: - - "PREFILL_NODES=6" - - "N_ADDITIONAL_FRONTENDS=9" - - "SCRIPT_MODE=1k1k-max-tpt" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/1k1k/stp/mid.yaml + - "CONFIG_FILE=recipes/gb300-fp8/1k1k/stp/mid.yaml" decode: num-worker: 1 - tp: 1 - ep: 48 + tp: 32 + ep: 32 + dp-attn: true + + # "Max throughput" (1 prefill worker at DEP8 and 1 decode worker at DEP8) + - conc-list: [4096, 7168, 7680] + prefill: + num-worker: 1 + tp: 8 + ep: 8 dp-attn: true additional-settings: - - "DECODE_NODES=12" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/1k1k/stp/max.yaml + - "CONFIG_FILE=recipes/gb300-fp8/1k1k/stp/max.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true - isl: 8192 osl: 1024 search-space: - # Low latency (1 prefill worker at DEP4 and 1 decode worker at DEP4) - - spec-decoding: "none" - conc-list: [ 4, 8, 16, 32 ] + # "Low latency" (1 prefill worker at TP4 and 1 decode worker at TP4) + - conc-list: [4, 8] prefill: num-worker: 1 - tp: 1 - ep: 4 - dp-attn: true + tp: 4 + ep: 1 + dp-attn: false additional-settings: - - "PREFILL_NODES=1" - - "N_ADDITIONAL_FRONTENDS=8" - - "SCRIPT_MODE=8k1k-low-latency" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/8k1k/stp/low-latency.yaml + - "CONFIG_FILE=recipes/gb300-fp8/8k1k/stp/low-latency.yaml" decode: num-worker: 1 - tp: 1 - ep: 4 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" + tp: 4 + ep: 1 + dp-attn: false - # Middle and top of curve (5 prefill workers each at DEP8 and 1 decode worker at DEP32) - - spec-decoding: "none" - conc-list: [ 512, 1024, 2048, 6144 ] + # "Mid curve" (5 prefill workers at DEP8 and 1 decode worker at DEP32) + - conc-list: [128, 256, 512, 1024] prefill: num-worker: 5 - tp: 1 + tp: 8 ep: 8 dp-attn: true additional-settings: - - "PREFILL_NODES=10" - - "N_ADDITIONAL_FRONTENDS=8" - - "SCRIPT_MODE=8k1k-max-tpt" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/8k1k/stp/mid.yaml + - "CONFIG_FILE=recipes/gb300-fp8/8k1k/stp/mid.yaml" decode: num-worker: 1 - tp: 1 + tp: 32 ep: 32 dp-attn: true + + # "Max throughput" (6 prefill workers at DEP8 and 1 decode worker at DEP24) + - conc-list: [2048, 4096] + prefill: + num-worker: 6 + tp: 8 + ep: 8 + dp-attn: true additional-settings: - - "DECODE_NODES=8" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/8k1k/stp/max.yaml + - "CONFIG_FILE=recipes/gb300-fp8/8k1k/stp/max.yaml" + decode: + num-worker: 1 + tp: 24 + ep: 24 + dp-attn: true dsr1-fp4-gb200-dynamo-sglang: image: "lmsysorg/sglang:v0.5.5.post2" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index aa8ad57f9..526f1ecf8 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -361,3 +361,13 @@ - "8k1k: 14 scenarios (7 MTP, 7 STP) for long context workloads" - "Prefill workers: 1-5P, Decode workers: 1-4D, TP/EP: 8/16/32" pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/617 + +- config-keys: + - dsr1-fp8-gb200-dynamo-sglang + - dsr1-fp8-gb300-dynamo-sglang + description: + - "Update GB200 and GB300 configs for DSR1 FP8 SGLANG STP mode" + - "Image: lmsysorg/sglang:v0.5.8-cu130" + - "Update prefill/decode worker counts, TP/EP parallelism, and dp-attn settings for 1k1k and 8k1k" + pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/XXX + diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 83e94005e..31e7486ab 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -11,6 +11,8 @@ if [[ $FRAMEWORK == "dynamo-sglang" ]]; then export CONFIG_DIR="/mnt/lustre01/artifacts/sglang-configs/1k1k" if [[ $MODEL_PREFIX == "dsr1" ]]; then export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528" + elif [[ $MODEL_PREFIX == "dsr1-fp8" ]]; then + export MODEL_PATH="/mnt/numa1/groups/sa-shared/models/deepseek-r1-0528/" else export MODEL_PATH=$MODEL fi @@ -48,50 +50,6 @@ srun -N 1 -A $SLURM_ACCOUNT -p $SLURM_PARTITION bash -c "enroot import -o $NGINX export ISL="$ISL" export OSL="$OSL" -if [[ $FRAMEWORK == "dynamo-sglang" ]]; then - export IMAGE=$SQUASH_FILE - export SGL_SLURM_JOBS_PATH="dynamo/examples/backends/sglang/slurm_jobs" - bash benchmarks/"${EXP_NAME%%_*}_${PRECISION}_gb200_${FRAMEWORK}.sh" - # Wait for all jobs to complete - echo "Waiting for all jobs to complete..." - while [ -n "$(squeue -u $USER --noheader --format='%i')" ]; do - echo "Jobs still running..." - squeue --steps -u $USER - sleep 30 - done - - # Find the latest log directory that contains the data - cat > collect_latest_results.py <<'PY' -import os, sys -sgl_job_dir, isl, osl, nexp = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]) -for path in sorted([f"{sgl_job_dir}/logs/{name}/vllm_isl_{isl}_osl_{osl}" for name in os.listdir(f"{sgl_job_dir}/logs/") if os.path.isdir(f"{sgl_job_dir}/logs/{name}/vllm_isl_{isl}_osl_{osl}")], key=os.path.getmtime, reverse=True)[:nexp]: - print(path) -PY - - LOGS_DIR=$(python3 collect_latest_results.py "$SGL_SLURM_JOBS_PATH" $ISL $OSL 1) - if [ -z "$LOGS_DIR" ]; then - echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" - exit 1 - fi - - echo "Found logs directory: $LOGS_DIR" - ls -la $LOGS_DIR - - # Result JSON are contained within the result directory - for result_file in $(find $LOGS_DIR -type f); do - # result_file should directly be isl_ISL_osl_OSL_concurrency_CONC_req_rate_R_gpus_N_ctx_M_gen_N.json - file_name=$(basename $result_file) - if [ -f $result_file ]; then - # Copy the result file to workspace with a unique name - WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${file_name}" - echo "Found result file ${result_file}. Copying them to ${WORKSPACE_RESULT_FILE}" - cp $result_file $WORKSPACE_RESULT_FILE - fi - done - - exit 0 -fi - echo "Cloning srt-slurm repository..." SRT_REPO_DIR="srt-slurm" if [ -d "$SRT_REPO_DIR" ]; then From 2259b91b0d911a5f6fddb9fd8a4116fc52f280d6 Mon Sep 17 00:00:00 2001 From: Jatin Gangani Date: Wed, 4 Feb 2026 16:35:53 -0800 Subject: [PATCH 02/10] update PR # --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 526f1ecf8..e87ba7eae 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -369,5 +369,5 @@ - "Update GB200 and GB300 configs for DSR1 FP8 SGLANG STP mode" - "Image: lmsysorg/sglang:v0.5.8-cu130" - "Update prefill/decode worker counts, TP/EP parallelism, and dp-attn settings for 1k1k and 8k1k" - pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/XXX + pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/635 From 6da98038e29edc4d924e00947710e7adb9fc5af2 Mon Sep 17 00:00:00 2001 From: Jatin Gangani Date: Wed, 4 Feb 2026 17:16:40 -0800 Subject: [PATCH 03/10] update typo --- .github/configs/nvidia-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index cff3839e4..823dfa6a0 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2965,8 +2965,8 @@ dsr1-fp8-gb200-dynamo-sglang: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/8k1k/max-tpt.yaml - - "CONFIG_FILE=recipes/gb200-fp8/8k1k/max-tpt.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/8k1k/max_tpt.yaml + - "CONFIG_FILE=recipes/gb200-fp8/8k1k/max_tpt.yaml" decode: num-worker: 1 tp: 24 From 05a6ba1d2f06b83b13aaaad21fb2091606f1bddd Mon Sep 17 00:00:00 2001 From: Jatin Gangani Date: Wed, 4 Feb 2026 21:49:03 -0800 Subject: [PATCH 04/10] Update model prefix to work with frontend --- .github/configs/nvidia-master.yaml | 4 ++-- runners/launch_gb200-nv.sh | 10 ++++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 823dfa6a0..19943c7cb 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2286,7 +2286,7 @@ dsr1-fp4-gb200-dynamo-trt: dsr1-fp8-gb200-dynamo-trt: image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1-fp8 + model-prefix: dsr1 runner: gb200 precision: fp8 framework: dynamo-trt @@ -2976,7 +2976,7 @@ dsr1-fp8-gb200-dynamo-sglang: dsr1-fp8-gb300-dynamo-sglang: image: lmsysorg/sglang:v0.5.8-cu130 model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1-fp8 + model-prefix: dsr1 runner: gb300 precision: fp8 framework: dynamo-sglang diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 31e7486ab..ac8161d3d 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -9,10 +9,12 @@ set -x # local paths to avoid repeated downloading on the shared GB200 cluster. if [[ $FRAMEWORK == "dynamo-sglang" ]]; then export CONFIG_DIR="/mnt/lustre01/artifacts/sglang-configs/1k1k" - if [[ $MODEL_PREFIX == "dsr1" ]]; then - export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528" - elif [[ $MODEL_PREFIX == "dsr1-fp8" ]]; then + if [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp4" ]]; then + export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528-fp4-v2/" + export SRT_SLURM_MODEL_PREFIX="dsfp4" + if [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then export MODEL_PATH="/mnt/numa1/groups/sa-shared/models/deepseek-r1-0528/" + export SRT_SLURM_MODEL_PREFIX="dsr1-fp8" else export MODEL_PATH=$MODEL fi @@ -96,7 +98,7 @@ srtctl_root: "${SRTCTL_ROOT}" # Model path aliases model_paths: - "${MODEL_PREFIX}": "${MODEL_PATH}" + "${SRT_SLURM_MODEL_PREFIX}": "${MODEL_PATH}" containers: dynamo-trtllm: ${SQUASH_FILE} dynamo-sglang: ${SQUASH_FILE} From 2eeb10b63e892d09c9f188d1bdab971cebf32718 Mon Sep 17 00:00:00 2001 From: Jatin Gangani Date: Wed, 4 Feb 2026 21:52:34 -0800 Subject: [PATCH 05/10] update model prefix --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 19943c7cb..928f77c27 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2864,7 +2864,7 @@ dsr1-fp8-gb200-dynamo-trt: dsr1-fp8-gb200-dynamo-sglang: image: lmsysorg/sglang:v0.5.8-cu130 model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1-fp8 + model-prefix: dsr1 runner: gb200 precision: fp8 framework: dynamo-sglang From f3ab616592c71703b02bfa88b6419ec6dcd2b669 Mon Sep 17 00:00:00 2001 From: Jatin Gangani Date: Wed, 4 Feb 2026 22:13:50 -0800 Subject: [PATCH 06/10] fix if else block --- runners/launch_gb200-nv.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index ac8161d3d..f1b10c120 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -12,7 +12,7 @@ if [[ $FRAMEWORK == "dynamo-sglang" ]]; then if [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp4" ]]; then export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528-fp4-v2/" export SRT_SLURM_MODEL_PREFIX="dsfp4" - if [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then + elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then export MODEL_PATH="/mnt/numa1/groups/sa-shared/models/deepseek-r1-0528/" export SRT_SLURM_MODEL_PREFIX="dsr1-fp8" else @@ -25,7 +25,7 @@ elif [[ $FRAMEWORK == "dynamo-trt" ]]; then elif [[ $MODEL_PREFIX == "dsr1" ]]; then export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528-fp4-v2/" export SERVED_MODEL_NAME="deepseek-r1-fp4" - elif [[ $MODEL_PREFIX == "dsr1-fp8" ]]; then + elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then export MODEL_PATH="/mnt/numa1/groups/sa-shared/models/deepseek-r1-0528/" export SERVED_MODEL_NAME="deepseek-r1-fp8" else From 30750c9774c175958457a58ddc85cdb2956a4438 Mon Sep 17 00:00:00 2001 From: Jatin Gangani Date: Thu, 5 Feb 2026 14:22:40 -0800 Subject: [PATCH 07/10] -Add ultra tpt point for gb200 fp8 1k1k --- .github/configs/nvidia-master.yaml | 16 ++++++++++++++++ runners/launch_gb200-nv.sh | 6 ++++-- runners/launch_gb300-nv.sh | 11 ++++++++--- 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 928f77c27..65a2414d5 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2922,6 +2922,22 @@ dsr1-fp8-gb200-dynamo-sglang: ep: 32 dp-attn: true + # "Ultra throughput" (2 prefill workers at DEP8 and 1 decode worker at DEP8) + - conc-list: [4096] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/1k1k/ultra-tpt.yaml + - "CONFIG_FILE=recipes/gb200-fp8/1k1k/ultra-tpt.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - isl: 8192 osl: 1024 search-space: diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index f1b10c120..aeddd2717 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -11,7 +11,7 @@ if [[ $FRAMEWORK == "dynamo-sglang" ]]; then export CONFIG_DIR="/mnt/lustre01/artifacts/sglang-configs/1k1k" if [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp4" ]]; then export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528-fp4-v2/" - export SRT_SLURM_MODEL_PREFIX="dsfp4" + export SRT_SLURM_MODEL_PREFIX="dsr1" elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then export MODEL_PATH="/mnt/numa1/groups/sa-shared/models/deepseek-r1-0528/" export SRT_SLURM_MODEL_PREFIX="dsr1-fp8" @@ -22,12 +22,14 @@ elif [[ $FRAMEWORK == "dynamo-trt" ]]; then if [[ $MODEL_PREFIX == "gptoss" ]]; then export MODEL_PATH="/mnt/lustre01/models/gpt-oss-120b" export SERVED_MODEL_NAME="gpt-oss-120b" - elif [[ $MODEL_PREFIX == "dsr1" ]]; then + elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp4" ]]; then export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528-fp4-v2/" export SERVED_MODEL_NAME="deepseek-r1-fp4" + export SRT_SLURM_MODEL_PREFIX="dsr1" elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then export MODEL_PATH="/mnt/numa1/groups/sa-shared/models/deepseek-r1-0528/" export SERVED_MODEL_NAME="deepseek-r1-fp8" + export SRT_SLURM_MODEL_PREFIX="dsr1-fp8" else echo "Unsupported model prefix: $MODEL_PREFIX. Supported prefixes are: gptoss or dsr1" exit 1 diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index 11d2a6b58..0fe24b891 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -33,11 +33,16 @@ export SLURM_ACCOUNT="benchmark" export MODEL_PATH=$MODEL -if [[ $MODEL_PREFIX == "dsr1" ]]; then +if [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp4" ]]; then export SERVED_MODEL_NAME="deepseek-r1-fp4" export MODEL_PATH=/raid/shared/models/deepseek-r1-0528-fp4-v2 + export SRT_SLURM_MODEL_PREFIX="dsr1" +elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then + export SERVED_MODEL_NAME="deepseek-r1-fp8" + export MODEL_PATH=/raid/shared/models/deepseek-r1-0528 + export SRT_SLURM_MODEL_PREFIX="dsr1-fp8" else - echo "Unsupported model prefix: $MODEL_PREFIX. Supported prefixes are: dsr1" + echo "Unsupported model: $MODEL_PREFIX-$PRECISION. Supported models are: dsr1-fp4, dsr1-fp8" exit 1 fi @@ -64,7 +69,7 @@ network_interface: "" srtctl_root: "${GITHUB_WORKSPACE}/srt-slurm" # Model path aliases model_paths: - "${MODEL_PREFIX}": "${MODEL_PATH}" + "${SRT_SLURM_MODEL_PREFIX}": "${MODEL_PATH}" containers: dynamo-trtllm: ${SQUASH_FILE} use_segment_sbatch_directive: false From e6f9fc307035f868cd6bc00d3fbb9e9ed032bf84 Mon Sep 17 00:00:00 2001 From: Jatin Gangani Date: Thu, 5 Feb 2026 14:55:35 -0800 Subject: [PATCH 08/10] Correct typo --- .github/configs/nvidia-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 65a2414d5..adcb96dce 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2922,10 +2922,10 @@ dsr1-fp8-gb200-dynamo-sglang: ep: 32 dp-attn: true - # "Ultra throughput" (2 prefill workers at DEP8 and 1 decode worker at DEP8) + # "Ultra throughput" (1 prefill workers at DEP8 and 1 decode worker at DEP8) - conc-list: [4096] prefill: - num-worker: 2 + num-worker: 1 tp: 8 ep: 8 dp-attn: true From 3683a8150d77b7c06784c650174233eaa1c9f781 Mon Sep 17 00:00:00 2001 From: Jatin Gangani Date: Fri, 6 Feb 2026 12:41:00 -0800 Subject: [PATCH 09/10] Update GBx00 SGLANG FP8 container --- .github/configs/nvidia-master.yaml | 4 ++-- runners/launch_gb300-nv.sh | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index adcb96dce..0fbfae6a8 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2862,7 +2862,7 @@ dsr1-fp8-gb200-dynamo-trt: dsr1-fp8-gb200-dynamo-sglang: - image: lmsysorg/sglang:v0.5.8-cu130 + image: lmsysorg/sglang:v0.5.8.post1-cu130 model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 runner: gb200 @@ -2990,7 +2990,7 @@ dsr1-fp8-gb200-dynamo-sglang: dp-attn: true dsr1-fp8-gb300-dynamo-sglang: - image: lmsysorg/sglang:v0.5.8-cu130 + image: lmsysorg/sglang:v0.5.8.post1-cu130 model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 runner: gb300 diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index 0fe24b891..80df5347e 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -72,6 +72,7 @@ model_paths: "${SRT_SLURM_MODEL_PREFIX}": "${MODEL_PATH}" containers: dynamo-trtllm: ${SQUASH_FILE} + dynamo-sglang: ${SQUASH_FILE} use_segment_sbatch_directive: false EOF From 914f7329908431381a65602c29fc26833053ab00 Mon Sep 17 00:00:00 2001 From: Kedar Potdar Date: Sun, 8 Feb 2026 18:33:46 -0800 Subject: [PATCH 10/10] add srt-slurm model prefix --- runners/launch_gb200-nv.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index ee6bb7d69..c4d1ed7af 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -11,7 +11,7 @@ if [[ $FRAMEWORK == "dynamo-sglang" ]]; then export CONFIG_DIR="/mnt/lustre01/artifacts/sglang-configs/1k1k" if [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528" - export SRT_SLURM_MODEL_PREFIX="dsr1" + export SRT_SLURM_MODEL_PREFIX="dsr1-fp8" elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp4" ]]; then export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528-fp4-v2/" export SRT_SLURM_MODEL_PREFIX="dsr1-fp4"