diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 02fbb434e..92ec024bc 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -900,7 +900,7 @@ dsr1-fp8-gb200-dynamo-sglang: - "DECODE_NODES=8" dsr1-fp4-gb200-dynamo-sglang: - image: lmsysorg/sglang:v0.5.5.post2 + image: lmsysorg/sglang:dev-cu13 # TODO: what is the right name? # model: deepseek-ai/DeepSeek-R1-0528-fp4-v2 # Models are pre-downloaded to this path on GB200 runner to avoid repeated downloading @@ -1049,6 +1049,56 @@ dsr1-fp4-gb200-dynamo-sglang: dp-attn: true additional-settings: - "DECODE_NODES=8" + - isl: 1024 + osl: 8192 + search-space: + # Low latency (1 prefill node, 2 decode nodes) + - spec-decoding: "none" + conc-list: [ 4, 8, 32, 64, 112 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipies/gb200-fp4/1k8k/low-latency.yaml" + decode: + num-worker: 2 + tp: 4 + ep: 1 + dp-attn: false + + # Mid curve (4 prefill nodes, 12 decode nodes, DEP48) + - spec-decoding: "none" + conc-list: [ 1, 128, 512, 2048, 4096, 8192 ] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipies/gb200-fp4/1k8k/mid-curve.yaml" + decode: + num-worker: 1 + tp: 48 + ep: 48 + dp-attn: true + + # Max throughput (4 prefill nodes, 8 decode nodes, DEP32) + - spec-decoding: "none" + conc-list: [ 1, 128, 512, 2048, 4096, 8192 ] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipies/gb200-fp4/1k8k/max-tpt.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true gptoss-fp4-gb200-dynamo-trt: image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.7.0.post2 diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 8a4ec7474..b96654d2a 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -4,16 +4,15 @@ set -x -echo "Cloning srt-slurm-trtllm repository..." -TRTLLM_REPO_DIR="srt-slurm-trtllm" -if [ -d "$TRTLLM_REPO_DIR" ]; then - echo "Removing existing $TRTLLM_REPO_DIR..." - rm -rf "$TRTLLM_REPO_DIR" +echo "Cloning srt-slurm repository..." +SRT_REPO_DIR="srt-slurm" +if [ -d "$SRT_REPO_DIR" ]; then + echo "Removing existing $SRT_REPO_DIR..." + rm -rf "$SRT_REPO_DIR" fi -git clone https://github.com/jthomson04/srt-slurm-trtllm.git "$TRTLLM_REPO_DIR" -cd "$TRTLLM_REPO_DIR" -git checkout jthomson04/trtllm-support +git clone https://github.com/ishandhanani/srt-slurm.git "$SRT_REPO_DIR" +cd "$SRT_REPO_DIR" echo "Installing srtctl..." curl -LsSf https://astral.sh/uv/install.sh | sh @@ -28,30 +27,18 @@ if ! command -v srtctl &> /dev/null; then exit 1 fi -echo "Configs available at: $TRTLLM_REPO_DIR/" +echo "Configs available at: $SRT_REPO_DIR/" # Set up environment variables for SLURM export SLURM_PARTITION="batch" export SLURM_ACCOUNT="benchmark" export SLURM_JOB_NAME="benchmark-dynamo.job" -# For SGLang - we are working on updating the 8k1k configs -# For now we add conditionals to this script to use newer code for the 1k1k configs - -if [[ "$FRAMEWORK" == "dynamo-sglang" ]]; then - SQUASH_FILE="/mnt/lustre01/users/sa-shared/images/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" - srun --partition=$SLURM_PARTITION --exclusive --time=180 bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" - - # Update the IMAGE variable to the squash file - export IMAGE=$SQUASH_FILE -fi - # MODEL_PATH is set in `nvidia-master.yaml` or any other yaml files export MODEL_PATH=$MODEL if [[ $FRAMEWORK == "dynamo-sglang" ]]; then export CONFIG_DIR="/mnt/lustre01/artifacts/sglang-configs/1k1k" - export SGL_SLURM_JOBS_PATH="dynamo/examples/backends/sglang/slurm_jobs" elif [[ $FRAMEWORK == "dynamo-trt" ]]; then if [[ $MODEL_PREFIX == "gptoss" ]]; then export MODEL_PATH="/mnt/lustre01/models/gpt-oss-120b" @@ -59,7 +46,7 @@ elif [[ $FRAMEWORK == "dynamo-trt" ]]; then elif [[ $MODEL_PREFIX == "dsr1" ]]; then export SERVED_MODEL_NAME="deepseek-r1-fp4" else - echo "Unsupported model prefix: $MODEL_PREFIX. Supported prefixes are: gptoss" + echo "Unsupported model prefix: $MODEL_PREFIX. Supported prefixes are: gptoss or dsr1" exit 1 fi fi @@ -67,10 +54,10 @@ fi export ISL="$ISL" export OSL="$OSL" -if [[ "$FRAMEWORK" == "dynamo-trt" ]]; then - # Create srtslurm.yaml for srtctl - echo "Creating srtslurm.yaml configuration..." - cat > srtslurm.yaml < srtslurm.yaml <&1) +else SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "gb200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) - echo "$SRTCTL_OUTPUT" +fi +echo "$SRTCTL_OUTPUT" - JOB_ID=$(echo "$SRTCTL_OUTPUT" | grep -oP '✅ Job \K[0-9]+' || echo "$SRTCTL_OUTPUT" | grep -oP 'Job \K[0-9]+') +JOB_ID=$(echo "$SRTCTL_OUTPUT" | grep -oP '✅ Job \K[0-9]+' || echo "$SRTCTL_OUTPUT" | grep -oP 'Job \K[0-9]+') - if [ -z "$JOB_ID" ]; then - echo "Error: Failed to extract JOB_ID from srtctl output" - exit 1 - fi +if [ -z "$JOB_ID" ]; then + echo "Error: Failed to extract JOB_ID from srtctl output" + exit 1 +fi - echo "Extracted JOB_ID: $JOB_ID" +echo "Extracted JOB_ID: $JOB_ID" - # Wait for this specific job to complete - echo "Waiting for job $JOB_ID to complete..." - while [ -n "$(squeue -j $JOB_ID --noheader 2>/dev/null)" ]; do - echo "Job $JOB_ID still running..." - squeue -j $JOB_ID - sleep 30 - done - echo "Job $JOB_ID completed!" +# Wait for this specific job to complete +echo "Waiting for job $JOB_ID to complete..." +while [ -n "$(squeue -j $JOB_ID --noheader 2>/dev/null)" ]; do + echo "Job $JOB_ID still running..." + squeue -j $JOB_ID + sleep 30 +done +echo "Job $JOB_ID completed!" - echo "Collecting results..." +echo "Collecting results..." - # Use the JOB_ID to find the logs directory - # srtctl creates logs in outputs/JOB_ID/logs/ - LOGS_DIR="outputs/$JOB_ID/logs" +# Use the JOB_ID to find the logs directory +# srtctl creates logs in outputs/JOB_ID/logs/ +LOGS_DIR="outputs/$JOB_ID/logs" - if [ ! -d "$LOGS_DIR" ]; then - echo "Warning: Logs directory not found at $LOGS_DIR" - exit 1 - fi +if [ ! -d "$LOGS_DIR" ]; then + echo "Warning: Logs directory not found at $LOGS_DIR" + exit 1 +fi - echo "Found logs directory: $LOGS_DIR" +echo "Found logs directory: $LOGS_DIR" - # Find all result subdirectories - RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) +# Find all result subdirectories +RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) - if [ -z "$RESULT_SUBDIRS" ]; then - echo "Warning: No result subdirectories found in $LOGS_DIR" - else - # Process results from all configurations - for result_subdir in $RESULT_SUBDIRS; do - echo "Processing result subdirectory: $result_subdir" +if [ -z "$RESULT_SUBDIRS" ]; then + echo "Warning: No result subdirectories found in $LOGS_DIR" +else + # Process results from all configurations + for result_subdir in $RESULT_SUBDIRS; do + echo "Processing result subdirectory: $result_subdir" - # Extract configuration info from directory name - CONFIG_NAME=$(basename "$result_subdir") + # Extract configuration info from directory name + CONFIG_NAME=$(basename "$result_subdir") - # Find all result JSON files - RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) + # Find all result JSON files + RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) - for result_file in $RESULT_FILES; do - if [ -f "$result_file" ]; then - # Extract metadata from filename - filename=$(basename "$result_file") - concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') - gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') - ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') - gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') + for result_file in $RESULT_FILES; do + if [ -f "$result_file" ]; then + # Extract metadata from filename + filename=$(basename "$result_file") + concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') + gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') + ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') + gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') - echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" - WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" - cp "$result_file" "$WORKSPACE_RESULT_FILE" + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" + cp "$result_file" "$WORKSPACE_RESULT_FILE" - echo "Copied result file to: $WORKSPACE_RESULT_FILE" - fi - done + echo "Copied result file to: $WORKSPACE_RESULT_FILE" + fi done - fi - - # Cleanup - echo "Cleaning up..." - deactivate 2>/dev/null || true - rm -rf .venv - echo "Cleanup complete" - -elif [[ "$FRAMEWORK" == "dynamo-sglang" ]]; then - bash benchmarks/"${EXP_NAME%%_*}_${PRECISION}_gb200_${FRAMEWORK}_slurm.sh" - - # Wait for all jobs to complete - echo "Waiting for all jobs to complete..." - while [ -n "$(squeue -u $USER --noheader --format='%i')" ]; do - echo "Jobs still running..." - squeue --steps -u $USER - sleep 30 - done - - # FIXME: The below is bad and is a result of the indirection of the ways in which - # Dynamo jobs are launched. In a follow-up PR, the location of the result file should not - # depend on the runner, it should always be in the same spot in the GH workspace. - - # Find the latest log directory that contains the data - cat > collect_latest_results.py <<'PY' -import os, sys -sgl_job_dir, isl, osl, nexp = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]) -for path in sorted([f"{sgl_job_dir}/logs/{name}/vllm_isl_{isl}_osl_{osl}" for name in os.listdir(f"{sgl_job_dir}/logs/") if os.path.isdir(f"{sgl_job_dir}/logs/{name}/vllm_isl_{isl}_osl_{osl}")], key=os.path.getmtime, reverse=True)[:nexp]: - print(path) -PY - - LOGS_DIR=$(python3 collect_latest_results.py "$SGL_SLURM_JOBS_PATH" $ISL $OSL 1) - if [ -z "$LOGS_DIR" ]; then - echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" - exit 1 - fi - - echo "Found logs directory: $LOGS_DIR" - ls -la $LOGS_DIR - - # Result JSON are contained within the result directory - for result_file in $(find $LOGS_DIR -type f); do - # result_file should directly be isl_ISL_osl_OSL_concurrency_CONC_req_rate_R_gpus_N_ctx_M_gen_N.json - file_name=$(basename $result_file) - if [ -f $result_file ]; then - # Copy the result file to workspace with a unique name - WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${file_name}" - echo "Found result file ${result_file}. Copying them to ${WORKSPACE_RESULT_FILE}" - cp $result_file $WORKSPACE_RESULT_FILE - fi done fi +# Cleanup +echo "Cleaning up..." +deactivate 2>/dev/null || true +rm -rf .venv +echo "Cleanup complete" + echo "All result files processed" \ No newline at end of file