From 779b7eaa47256000438b2c29aaed7858b7779f05 Mon Sep 17 00:00:00 2001 From: Elnifio Date: Wed, 5 Nov 2025 15:16:45 -0800 Subject: [PATCH 01/11] Bumps DSR1 SGLang code --- runners/launch_gb200-nv.sh | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 3bd9170ff..c16879869 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -8,9 +8,17 @@ export SLURM_PARTITION="batch" export SLURM_ACCOUNT="benchmark" export SLURM_JOB_NAME="benchmark-dynamo.job" +# For SGLang - we are working on updating the 8k1k configs +# For now we add conditionals to this script to use newer code for the 1k1k configs + ### FRAMEWORK_DIFF_IF_STATEMENT #1 - difference in setting up envvars if [[ $FRAMEWORK == "dynamo-sglang" ]]; then - export IMAGE="/mnt/lustre01/artifacts/containers/dynamo-sglang.sqsh" + # Set IMAGE based on ISL/OSL + if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then + export IMAGE="/mnt/lustre01/artifacts/containers/lmsysorg+sglang+v0.5.4.post3-cu129-arm64.sqsh" + else + export IMAGE="/mnt/lustre01/artifacts/containers/dynamo-sglang.sqsh" + fi export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528" export CONFIG_DIR="/mnt/lustre01/artifacts/sglang-configs/1k1k" else @@ -158,12 +166,21 @@ if [[ $FRAMEWORK == "dynamo-trtllm" ]]; then else # if statement at the top - search for "FRAMEWORK_DIFF_IF_STATEMENT #2" # Set up Dynamo repository path DYNAMO_PATH="/mnt/lustre01/users/sa-shared/benchmarks/dynamo" - SGL_SLURM_JOBS_PATH="$DYNAMO_PATH/components/backends/sglang/slurm_jobs" + if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then + SGL_SLURM_JOBS_PATH="$DYNAMO_PATH/examples/backends/sglang/slurm_jobs" + else + SGL_SLURM_JOBS_PATH="$DYNAMO_PATH/components/backends/sglang/slurm_jobs" + fi # Always clone and setup Dynamo echo "Cloning Dynamo repository..." rm -rf "$DYNAMO_PATH" - git clone --branch update-wait-for-model https://github.com/Elnifio/dynamo.git $DYNAMO_PATH + if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then + # TODO: before merge this will be a different branch off of main + git clone --branch ishan/iter https://github.com/ai-dynamo/dynamo.git $DYNAMO_PATH + else + git clone --branch update-wait-for-model https://github.com/Elnifio/dynamo.git $DYNAMO_PATH + fi cd "$DYNAMO_PATH" # Navigate to corresponding directory @@ -179,8 +196,15 @@ else # if statement at the top - search for "FRAMEWORK_DIFF_IF_STATEMENT #2" # Launch jobs based on ISL/OSL if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then - concurrency_list="1024x2048x4096x4608x4864x4992x5120x5376x5632x6144x8192" - bash ./submit_disagg.sh 6 3 12 1 8 $ISL $OSL $concurrency_list inf + top_to_middle_of_curve_concurrency_list="1024x2048x4096" + bottom_of_curve_concurrency_list="2x4x8x16x64x128x256x512" + + # Top to middle of curve (2 prefill workers each at DEP8 and 1 decode worker at DEP32) + bash ./submit_disagg.sh 4 2 8 1 9 $ISL $OSL $top_to_middle_of_curve_concurrency_list inf + + # Bottom of curve (1 prefill worker at DEP4 and 4 decode workers at DEP4) + bash ./submit_disagg.sh 1 1 4 4 9 $ISL $OSL $bottom_of_curve_concurrency_list inf 1p_4d + elif [ "$ISL" = "8192" ] && [ "$OSL" = "1024" ]; then concurrency_list="128x256x384x448x512x576x1024x2048x4096" bash ./submit_disagg.sh 12 6 6 1 8 $ISL $OSL $concurrency_list inf From 9573a78decee950a5f464adde1ea024d22b7c964 Mon Sep 17 00:00:00 2001 From: Elnifio Date: Wed, 5 Nov 2025 16:37:42 -0800 Subject: [PATCH 02/11] temporarily adds debugging messages --- runners/launch_gb200-nv.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index c16879869..8eb5be15c 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -165,6 +165,7 @@ if [[ $FRAMEWORK == "dynamo-trtllm" ]]; then else # if statement at the top - search for "FRAMEWORK_DIFF_IF_STATEMENT #2" # Set up Dynamo repository path + set -x DYNAMO_PATH="/mnt/lustre01/users/sa-shared/benchmarks/dynamo" if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then SGL_SLURM_JOBS_PATH="$DYNAMO_PATH/examples/backends/sglang/slurm_jobs" @@ -212,6 +213,8 @@ else # if statement at the top - search for "FRAMEWORK_DIFF_IF_STATEMENT #2" echo "Unsupported ISL/OSL combination: $ISL/$OSL" exit 1 fi + + set +x fi # Wait for all jobs to complete From 68caa70b2f3bfda90eb1a99021f30b99890633de Mon Sep 17 00:00:00 2001 From: Elnifio Date: Thu, 6 Nov 2025 10:35:10 -0800 Subject: [PATCH 03/11] update how we get the log --- runners/launch_gb200-nv.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 8eb5be15c..6451c6f53 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -288,7 +288,7 @@ if [[ $FRAMEWORK == "dynamo-trtllm" ]]; then else # search for "FRAMEWORK_DIFF_IF_STATEMENT #3" for this if-statement # Find the latest log directory # we do "tail -1" here since only the latest job will yield the result - LOGS_DIR=$(find logs/*/vllm_isl_${ISL}_osl_${OSL} -type d | sort -V | tail -1) + LOGS_DIR=$(find logs/*/vllm_isl_${ISL}_osl_${OSL} -type d | sort -V | tail -2) if [ -z "$LOGS_DIR" ]; then echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" exit 1 From bd0113f0145d1605c7c4514c28e12cd9c0ba63b2 Mon Sep 17 00:00:00 2001 From: Elnifio Date: Thu, 6 Nov 2025 12:35:13 -0800 Subject: [PATCH 04/11] further update how we get the resulting log files --- runners/launch_gb200-nv.sh | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 6451c6f53..7eaa059bd 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -195,8 +195,13 @@ else # if statement at the top - search for "FRAMEWORK_DIFF_IF_STATEMENT #2" export CONFIG_DIR=$CONFIG_DIR export CONTAINER_IMAGE=$IMAGE + # This number is set in the `submit_disagg.sh` script. + RETRIES=1 + # Launch jobs based on ISL/OSL if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then + NUMBER_OF_EXPERIMENTS=2 + top_to_middle_of_curve_concurrency_list="1024x2048x4096" bottom_of_curve_concurrency_list="2x4x8x16x64x128x256x512" @@ -207,6 +212,8 @@ else # if statement at the top - search for "FRAMEWORK_DIFF_IF_STATEMENT #2" bash ./submit_disagg.sh 1 1 4 4 9 $ISL $OSL $bottom_of_curve_concurrency_list inf 1p_4d elif [ "$ISL" = "8192" ] && [ "$OSL" = "1024" ]; then + NUMBER_OF_EXPERIMENTS=1 + concurrency_list="128x256x384x448x512x576x1024x2048x4096" bash ./submit_disagg.sh 12 6 6 1 8 $ISL $OSL $concurrency_list inf else @@ -286,9 +293,18 @@ if [[ $FRAMEWORK == "dynamo-trtllm" ]]; then done else # search for "FRAMEWORK_DIFF_IF_STATEMENT #3" for this if-statement - # Find the latest log directory - # we do "tail -1" here since only the latest job will yield the result - LOGS_DIR=$(find logs/*/vllm_isl_${ISL}_osl_${OSL} -type d | sort -V | tail -2) + # Find the latest log directory that contains the data + cat > collect_latest_results.py <<'PY' +import os, sys +isl, osl, nexp, total_retries = [int(x) for x in sys.argv[1:]] +for chosen_slurm_id in [ + max([int(x) for x in os.listdir("logs/") if int(x) < end_index]) + for end_index in + [min([int(x) for x in os.listdir("logs/")]) + (total_retries+1) * (exp_idx+1) for exp_idx in range(nexp)] +]: + print(f"logs/{chosen_slurm_id}/vllm_isl_{isl}_osl_{osl}") +PY + LOGS_DIR=$(python3 collect_latest_results.py $ISL $OSL $NUMBER_OF_EXPERIMENTS $RETRIES) if [ -z "$LOGS_DIR" ]; then echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" exit 1 From d6f6bd97fa7d77eff2acf4cd090b1a4d71c7edf5 Mon Sep 17 00:00:00 2001 From: Elnifio Date: Thu, 6 Nov 2025 14:51:29 -0800 Subject: [PATCH 05/11] update to the new log name --- runners/launch_gb200-nv.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 7eaa059bd..26d8fe587 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -298,9 +298,9 @@ else # search for "FRAMEWORK_DIFF_IF_STATEMENT #3" for this if-statement import os, sys isl, osl, nexp, total_retries = [int(x) for x in sys.argv[1:]] for chosen_slurm_id in [ - max([int(x) for x in os.listdir("logs/") if int(x) < end_index]) + list(filter(lambda input_log_name: int(input_log_name.split("_")[0]) < end_index, sorted(os.listdir("logs/"), key=lambda log_name: int(log_name.split("_")[0]))))[-1] for end_index in - [min([int(x) for x in os.listdir("logs/")]) + (total_retries+1) * (exp_idx+1) for exp_idx in range(nexp)] + [min([int(log_name.split("_")[0]) for log_name in os.listdir("logs/")]) + (total_retries+1) * (exp_idx+1) for exp_idx in range(nexp)] ]: print(f"logs/{chosen_slurm_id}/vllm_isl_{isl}_osl_{osl}") PY From 9839bcfc2a9eb60f8349fb2c23f072e1c56af313 Mon Sep 17 00:00:00 2001 From: Elnifio Date: Thu, 6 Nov 2025 15:39:41 -0800 Subject: [PATCH 06/11] update --- runners/launch_gb200-nv.sh | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 26d8fe587..68673b640 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -195,9 +195,6 @@ else # if statement at the top - search for "FRAMEWORK_DIFF_IF_STATEMENT #2" export CONFIG_DIR=$CONFIG_DIR export CONTAINER_IMAGE=$IMAGE - # This number is set in the `submit_disagg.sh` script. - RETRIES=1 - # Launch jobs based on ISL/OSL if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then NUMBER_OF_EXPERIMENTS=2 @@ -296,15 +293,11 @@ else # search for "FRAMEWORK_DIFF_IF_STATEMENT #3" for this if-statement # Find the latest log directory that contains the data cat > collect_latest_results.py <<'PY' import os, sys -isl, osl, nexp, total_retries = [int(x) for x in sys.argv[1:]] -for chosen_slurm_id in [ - list(filter(lambda input_log_name: int(input_log_name.split("_")[0]) < end_index, sorted(os.listdir("logs/"), key=lambda log_name: int(log_name.split("_")[0]))))[-1] - for end_index in - [min([int(log_name.split("_")[0]) for log_name in os.listdir("logs/")]) + (total_retries+1) * (exp_idx+1) for exp_idx in range(nexp)] -]: - print(f"logs/{chosen_slurm_id}/vllm_isl_{isl}_osl_{osl}") +isl, osl, nexp = [int(x) for x in sys.argv[1:]] +for path in sorted([f"logs/{name}/vllm_isl_{isl}_osl_{osl}" for name in os.listdir("logs/") if os.path.isdir(f"logs/{name}/vllm_isl_{isl}_osl_{osl}")], key=os.path.getmtime, reverse=True)[:nexp]: + print(path) PY - LOGS_DIR=$(python3 collect_latest_results.py $ISL $OSL $NUMBER_OF_EXPERIMENTS $RETRIES) + LOGS_DIR=$(python3 collect_latest_results.py $ISL $OSL $NUMBER_OF_EXPERIMENTS) if [ -z "$LOGS_DIR" ]; then echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" exit 1 From 11901185aa59806d8cc9b8a870d7d0e77b32ca55 Mon Sep 17 00:00:00 2001 From: Elnifio Date: Wed, 12 Nov 2025 13:41:33 -0800 Subject: [PATCH 07/11] update the branch --- runners/launch_gb200-nv.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 68673b640..019f4b821 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -178,7 +178,7 @@ else # if statement at the top - search for "FRAMEWORK_DIFF_IF_STATEMENT #2" rm -rf "$DYNAMO_PATH" if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then # TODO: before merge this will be a different branch off of main - git clone --branch ishan/iter https://github.com/ai-dynamo/dynamo.git $DYNAMO_PATH + git clone --branch ishan/sa-1.1-sgl-dsr1-fp8 https://github.com/ai-dynamo/dynamo.git $DYNAMO_PATH else git clone --branch update-wait-for-model https://github.com/Elnifio/dynamo.git $DYNAMO_PATH fi From 67be8967da78061e3520035d92eb50ef79cfea41 Mon Sep 17 00:00:00 2001 From: Elnifio Date: Mon, 24 Nov 2025 10:06:37 -0800 Subject: [PATCH 08/11] update config --- runners/launch_gb200-nv.sh | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 019f4b821..a4b8a8903 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -197,13 +197,17 @@ else # if statement at the top - search for "FRAMEWORK_DIFF_IF_STATEMENT #2" # Launch jobs based on ISL/OSL if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then - NUMBER_OF_EXPERIMENTS=2 + NUMBER_OF_EXPERIMENTS=3 - top_to_middle_of_curve_concurrency_list="1024x2048x4096" + top_of_curve_concurrency_list="4096" + middle_of_curve_concurrency_list="1024x2048x4096" bottom_of_curve_concurrency_list="2x4x8x16x64x128x256x512" - # Top to middle of curve (2 prefill workers each at DEP8 and 1 decode worker at DEP32) - bash ./submit_disagg.sh 4 2 8 1 9 $ISL $OSL $top_to_middle_of_curve_concurrency_list inf + # Top of curve (2 prefill workers each at DEP8 and 1 decode worker at DEP32) + bash ./submit_disagg.sh 4 2 8 1 9 $ISL $OSL $top_of_curve_concurrency_list inf + + # Middle of curve (3 prefill workers each at DEP8 and 1 decode worker at DEP48) + bash ./submit_disagg.sh 6 3 12 1 9 $ISL $OSL $middle_of_curve_concurrency_list inf # Bottom of curve (1 prefill worker at DEP4 and 4 decode workers at DEP4) bash ./submit_disagg.sh 1 1 4 4 9 $ISL $OSL $bottom_of_curve_concurrency_list inf 1p_4d From a71a56101c32296dac2243a9526bf7081c20e73e Mon Sep 17 00:00:00 2001 From: Elnifio Date: Tue, 25 Nov 2025 10:41:00 -0800 Subject: [PATCH 09/11] update concurrencies --- runners/launch_gb200-nv.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index a4b8a8903..d7fd6fe66 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -201,7 +201,7 @@ else # if statement at the top - search for "FRAMEWORK_DIFF_IF_STATEMENT #2" top_of_curve_concurrency_list="4096" middle_of_curve_concurrency_list="1024x2048x4096" - bottom_of_curve_concurrency_list="2x4x8x16x64x128x256x512" + bottom_of_curve_concurrency_list="2x4x8x16x64x128" # Top of curve (2 prefill workers each at DEP8 and 1 decode worker at DEP32) bash ./submit_disagg.sh 4 2 8 1 9 $ISL $OSL $top_of_curve_concurrency_list inf From 90828d3ad002de5b3505069f294dffaf561871e3 Mon Sep 17 00:00:00 2001 From: Elnifio Date: Tue, 25 Nov 2025 11:57:53 -0800 Subject: [PATCH 10/11] updates the experiment order --- runners/launch_gb200-nv.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index d7fd6fe66..7b86ac473 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -206,12 +206,12 @@ else # if statement at the top - search for "FRAMEWORK_DIFF_IF_STATEMENT #2" # Top of curve (2 prefill workers each at DEP8 and 1 decode worker at DEP32) bash ./submit_disagg.sh 4 2 8 1 9 $ISL $OSL $top_of_curve_concurrency_list inf - # Middle of curve (3 prefill workers each at DEP8 and 1 decode worker at DEP48) - bash ./submit_disagg.sh 6 3 12 1 9 $ISL $OSL $middle_of_curve_concurrency_list inf - # Bottom of curve (1 prefill worker at DEP4 and 4 decode workers at DEP4) bash ./submit_disagg.sh 1 1 4 4 9 $ISL $OSL $bottom_of_curve_concurrency_list inf 1p_4d + # Middle of curve (3 prefill workers each at DEP8 and 1 decode worker at DEP48) + bash ./submit_disagg.sh 6 3 12 1 9 $ISL $OSL $middle_of_curve_concurrency_list inf + elif [ "$ISL" = "8192" ] && [ "$OSL" = "1024" ]; then NUMBER_OF_EXPERIMENTS=1 From 184bf999b6eb8b96437d08134e46ae48f3504c3c Mon Sep 17 00:00:00 2001 From: Elnifio Date: Wed, 26 Nov 2025 11:10:10 -0800 Subject: [PATCH 11/11] updates the container link --- runners/launch_gb200-nv.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index ebb66316d..11ee233ea 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -15,7 +15,7 @@ export SLURM_JOB_NAME="benchmark-dynamo.job" if [[ $FRAMEWORK == "dynamo-sglang" ]]; then # Set IMAGE based on ISL/OSL if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then - export IMAGE="/mnt/lustre01/artifacts/containers/lmsysorg+sglang+v0.5.4.post3-cu129-arm64.sqsh" + export IMAGE="/mnt/lustre01/artifacts/containers/lmsysorg+sglang+v0.5.5.post2.sqsh" else export IMAGE="/mnt/lustre01/artifacts/containers/dynamo-sglang.sqsh" fi