diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 7e975fdba..ec3a67103 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7722,3 +7722,38 @@ dsv4-fp4-gb200-dynamo-vllm: tp: 8 ep: 8 dp-attn: true + +dsv4-fp4-gb300-dynamo-sglang: + # _arm64 variant: GH runner pod doing `enroot import` is amd64, but + # gb300-cw compute nodes are aarch64 (Grace). Without the explicit + # arm64 tag the registry serves the amd64 manifest, which fails to + # exec on the compute side. + image: lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: gb300-cw + precision: fp4 + framework: dynamo-sglang + multinode: true + disagg: true + # Uses the sa-bench recipe copied exactly from NVIDIA/srt-slurm: + # recipes/dsv4-pro/sglang/gb200-fp4/1k1k/disagg/stp/disagg-1p3d-tp8.yaml + # at commit 9d75f82acec163594658a440f39dd7f1bd35bd16. + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # 1 prefill worker and 3 decode workers, each TP=8. 
+ - conc-list: [32, 64, 128, 256, 512, 1024] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/dsv4-pro/sglang/gb200-fp4/1k1k/disagg/stp/disagg-1p3d-tp8.yaml" + decode: + num-worker: 3 + tp: 8 + ep: 1 + dp-attn: false diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml index 60f3299cf..f574c629c 100644 --- a/.github/configs/runners.yaml +++ b/.github/configs/runners.yaml @@ -139,3 +139,8 @@ gb300: - 'gb300-nv_0' - 'gb300-nv_1' - 'gb300-nv_2' +gb300-cw: +- 'gb300-cw_0' +- 'gb300-cw_1' +- 'gb300-cw_2' +- 'gb300-cw_3' diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index 75036a986..45330b378 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -171,6 +171,9 @@ jobs: set -x # Export RESULT_FILENAME early so it's available for artifact uploads even if cancelled echo "RESULT_FILENAME=${RESULT_FILENAME}" >> $GITHUB_ENV + rm -f multinode_server_logs.tar.gz + rm -rf LOGS + rm -f ${RESULT_FILENAME}_*.json agg_${RESULT_FILENAME}_*.json export ${{ join(fromJson(inputs.prefill-additional-settings), ' ') }} ${{ join(fromJson(inputs.decode-additional-settings), ' ') }} export IS_MULTINODE=true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/gb200-fp4/1k1k/disagg/stp/disagg-1p1d-tp8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/gb200-fp4/1k1k/disagg/stp/disagg-1p1d-tp8.yaml new file mode 100644 index 000000000..158f5d299 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/gb200-fp4/1k1k/disagg/stp/disagg-1p1d-tp8.yaml @@ -0,0 +1,80 @@ +# DeepSeek-V4-Pro disaggregated on GB200 (1P1D, TP=8, MXFP4) + +# AIME 2025 (aime25): all 30 problems, full concurrency + +name: "dsv4-pro-gb200-1k1k-disagg-1p1d-tp8-aime" + +dynamo: + hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d" + +frontend: + type: dynamo + nginx_container: 
nginx + +model: + path: "dspro" + container: "dspro-0426-nixl" + precision: "mxfp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_node: 4 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +backend: + type: sglang + + prefill_environment: + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + + decode_environment: + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + + sglang_config: + prefill: + disaggregation-bootstrap-port: 30001 + served-model-name: "dspro" + trust-remote-code: true + tensor-parallel-size: 8 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 8192 + disable-flashinfer-autotune: true + max-running-requests: 1024 + cuda-graph-max-bs: 2048 + mem-fraction-static: 0.85 + + decode: + served-model-name: "dspro" + disaggregation-bootstrap-port: 30001 + trust-remote-code: true + tensor-parallel-size: 8 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 8192 + disable-flashinfer-autotune: true + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + mem-fraction-static: 0.85 + +benchmark: + type: "aime" + aime_dataset: "aime25" + num_threads: 30 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/gb200-fp4/1k1k/disagg/stp/disagg-1p3d-tp8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/gb200-fp4/1k1k/disagg/stp/disagg-1p3d-tp8.yaml new file mode 100644 index 000000000..9e81c512a --- /dev/null +++ 
b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/gb200-fp4/1k1k/disagg/stp/disagg-1p3d-tp8.yaml @@ -0,0 +1,83 @@ +# DeepSeek-V4-Pro disaggregated on GB200 (1P3D, TP=8, MXFP4) +# +# Some basic rate matching +# TODO: no optimizations have been applied yet + +dynamo: + hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d" + +frontend: + type: dynamo + nginx_container: nginx + +model: + path: "dspro" + container: "dspro-0426" + precision: "mxfp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + decode_nodes: 6 + prefill_workers: 1 + decode_workers: 3 + gpus_per_node: 4 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +backend: + type: sglang + + prefill_environment: + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + + decode_environment: + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + + sglang_config: + prefill: + disaggregation-bootstrap-port: 30001 + served-model-name: "dspro" + trust-remote-code: true + tensor-parallel-size: 8 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 8192 + disable-flashinfer-autotune: true + max-running-requests: 1024 + cuda-graph-max-bs: 2048 + mem-fraction-static: 0.85 + + decode: + served-model-name: "dspro" + disaggregation-bootstrap-port: 30001 + trust-remote-code: true + tensor-parallel-size: 8 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 8192 + disable-flashinfer-autotune: true + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + mem-fraction-static: 
0.85 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "32x64x128x256x512x1024" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/gb200-fp4/8k1k/disagg/stp/disagg-1p1d-tp8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/gb200-fp4/8k1k/disagg/stp/disagg-1p1d-tp8.yaml new file mode 100644 index 000000000..a8e11de01 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/gb200-fp4/8k1k/disagg/stp/disagg-1p1d-tp8.yaml @@ -0,0 +1,94 @@ +# DeepSeek-V4-Pro disaggregated on GB200 (1P1D, TP=8, MXFP4) — 8k1k newtp + dspro-0426. +# WIP + +name: "gb200-mxfp4-8k1k-disagg-newtp" + +dynamo: + hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d" + +frontend: + type: dynamo + nginx_container: nginx + +model: + path: "dspro" + container: "dspro-0426" + precision: "mxfp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_node: 4 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +backend: + type: sglang + + prefill_environment: + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + + decode_environment: + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + + sglang_config: + prefill: + 
disaggregation-bootstrap-port: 30001 + served-model-name: "dspro" + trust-remote-code: true + tensor-parallel-size: 8 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 8192 + disable-flashinfer-autotune: true + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + swa-full-tokens-ratio: 0.1 + + decode: + served-model-name: "dspro" + disaggregation-bootstrap-port: 30001 + trust-remote-code: true + tensor-parallel-size: 8 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 8192 + disable-flashinfer-autotune: true + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + swa-full-tokens-ratio: 0.1 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "32x64x128x256x512" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 2bd14e776..4969573e0 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1877,7 +1877,7 @@ - "Image pinned to lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3" - "DP-attention path enables SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 for better SWA eviction behavior" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1185 - + - config-keys: - dsv4-fp4-b200-sglang description: @@ -1985,3 +1985,12 @@ - "Topology: 1 prefill DEP8 worker and 4 decode TP8 workers with dedicated NATS/etcd" - "Mirrors the historical 1P4D DEP8/TP8 offload point from srt-slurm aflowers/vllm-gb200-v0.20.0" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1218 + +- config-keys: + - dsv4-fp4-gb300-dynamo-sglang + description: + - "Add DeepSeek-V4-Pro FP4 GB300 Dynamo SGLang disaggregated multinode 
configuration" + - "Image: lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64 (gb300-cw compute nodes are aarch64)" + - "Topology: 1 prefill worker + 3 decode workers, TP=8, MXFP4 MoE kernels, NIXL KV transfer" + - "Recipes copied exactly from NVIDIA/srt-slurm recipes/dsv4-pro/sglang/gb200-fp4 at commit 9d75f82acec163594658a440f39dd7f1bd35bd16" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1169 diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh new file mode 100755 index 000000000..70c03987b --- /dev/null +++ b/runners/launch_gb300-cw.sh @@ -0,0 +1,303 @@ +#!/usr/bin/bash + +# Launches multi-node Dynamo + SGLang benchmarks on the gb300-cw +# (CoreWeave) cluster. Adapted from the dynamo-vllm sibling launcher in +# the dsv4-fp4-gb300-dynamo-vllm-disagg branch (PR #1150). The SGLang +# recipes are copied exactly from the pinned srt-slurm commit below. + +set -x + +archive_server_logs() { + if [ -n "${LOGS_DIR:-}" ] && [ -d "$LOGS_DIR" ]; then + local workspace="${GITHUB_WORKSPACE:-$(pwd)}" + echo "Archiving server logs from $LOGS_DIR" + rm -rf "$workspace/LOGS" + cp -r "$LOGS_DIR" "$workspace/LOGS" || true + tar czf "$workspace/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . || true + fi +} + +trap 'status=$?; archive_server_logs; exit $status' EXIT +trap 'echo "Received termination signal"; exit 143' INT TERM + +if [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then + # Weights staged on the shared VAST mount; no compute-node-local + # NVMe on cw. The exact upstream recipes refer to this model as + # `dspro`. + export MODEL_PATH="/mnt/vast/models/dsv4/" +else + echo "Unsupported model prefix/precision/framework combination on gb300-cw: $MODEL_PREFIX/$PRECISION/$FRAMEWORK. Currently supported: dsv4/fp4/dynamo-sglang" + exit 1 +fi + +# CoreWeave cluster has a single `all` partition; account `cw-sup` is +# what `sacctmgr show assoc user=$USER` returns there. 
`benchmark` +# (inherited from gb200-nv) does not exist on cw. +export SLURM_PARTITION="all" +export SLURM_ACCOUNT="cw-sup" + +# Pyxis/enroot's NVIDIA prestart hook reads these from the runtime env +# to decide which host driver libraries (libcuda.so.1, libnvidia-*.so) +# to mount into the container. cw doesn't set them by default — without +# them the container has no libcuda and CUDA init fails. SLURM's default +# --export=ALL propagates these from this shell through sbatch+srun +# into the enroot environment. +export NVIDIA_VISIBLE_DEVICES=all +export NVIDIA_DRIVER_CAPABILITIES=compute,utility + +NGINX_IMAGE="nginx:1.27.4" +SRT_SLURM_RECIPES_COMMIT="9d75f82acec163594658a440f39dd7f1bd35bd16" + +# Squash files live alongside models on /mnt/vast (shared across nodes). +# `squash_dupe` instead of `squash` to use '_'-separated names: srtctl / +# pyxis rejects '+' in image paths with "Invalid image format", and the +# old /mnt/vast/squash dir contains '+'-separated files from prior runs. +SQUASH_DIR="/mnt/vast/squash_dupe" +mkdir -p "$SQUASH_DIR" +SQUASH_FILE="$SQUASH_DIR/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +NGINX_SQUASH_FILE="$SQUASH_DIR/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + +enroot import -o $SQUASH_FILE docker://$IMAGE +enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE + +export EVAL_ONLY="${EVAL_ONLY:-false}" + +export ISL="$ISL" +export OSL="$OSL" + +# srt-slurm path requires a CONFIG_FILE pointing to a recipe YAML. +# Without it, srtctl apply scans every YAML in the repo and submits +# hundreds of jobs. +if [[ -z "$CONFIG_FILE" ]]; then + echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2 + echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 + exit 1 +fi + +echo "Cloning srt-slurm repository..." +SRT_REPO_DIR="srt-slurm" +if [ -d "$SRT_REPO_DIR" ]; then + echo "Removing existing $SRT_REPO_DIR..." 
+ rm -rf "$SRT_REPO_DIR" +fi + +git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" +cd "$SRT_REPO_DIR" +git checkout "$SRT_SLURM_RECIPES_COMMIT" + +# Overlay the local copy of the exact pinned recipes. This keeps the PR +# self-contained while preserving byte-for-byte recipe content from +# NVIDIA/srt-slurm at $SRT_SLURM_RECIPES_COMMIT. +cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/gb200-fp4" recipes/dsv4-pro/sglang/gb200-fp4 + +echo "Installing srtctl..." +# CRITICAL — uv install location. +# Runner pod is x86 but compute nodes are aarch64, and /mnt/home is +# shared NFS across both. srtctl's slurm template (job_script_minimal.j2) +# does `if ! command -v uv` and skips its own ARM64 install when uv is +# already on PATH; on compute nodes $HOME/.local/bin is on PATH by +# default, so a stray x86 binary at $HOME/.local/bin/uv from this +# runner shadows the template's install and crashes the orchestrator +# with `cannot execute binary file: Exec format error`. Install to a +# runner-pod-local /tmp path (tmpfs, not NFS) and scrub any stale x86 +# uv left in the shared path by prior runs. +rm -f "$HOME/.local/bin/uv" "$HOME/.local/bin/uvx" +export XDG_BIN_HOME="/tmp/uv-runner-${RUNNER_NAME:-default}/bin" +mkdir -p "$XDG_BIN_HOME" +curl -LsSf https://astral.sh/uv/install.sh | env INSTALLER_NO_MODIFY_PATH=1 sh +export PATH="$XDG_BIN_HOME:$PATH" + +if [ ! -x "$XDG_BIN_HOME/uv" ]; then + echo "ERROR: uv not at $XDG_BIN_HOME/uv after install — install script may not honor XDG_BIN_HOME on this version. Aborting before x86 uv leaks onto NFS." >&2 + exit 1 +fi +if [ -e "$HOME/.local/bin/uv" ]; then + echo "ERROR: uv install leaked to shared $HOME/.local/bin/uv. Remove it and re-run." >&2 + exit 1 +fi + +uv venv +source .venv/bin/activate +uv pip install -e . + +if ! 
command -v srtctl &> /dev/null; then + echo "Error: Failed to install srtctl" + exit 1 +fi + +echo "Configs available at: $SRT_REPO_DIR/" + +SRTCTL_ROOT="${GITHUB_WORKSPACE}/srt-slurm" +echo "Creating srtslurm.yaml configuration..." +cat > srtslurm.yaml < "$TMP_CONFIG_FILE" + mv "$TMP_CONFIG_FILE" "$CONFIG_FILE" +fi + +# CoreWeave needs explicit CPU and memory allocation for srt-slurm SGLang +# jobs. Apply this only to the cloned runtime copy so the committed +# NVIDIA recipe files stay byte-identical to the pinned source. +if ! grep -q '^sbatch_directives:' "$CONFIG_FILE"; then + { + echo "" + echo "# CoreWeave runtime-only Slurm resource directives." + echo "sbatch_directives:" + echo ' cpus-per-task: "144"' + echo ' mem: "0"' + } >> "$CONFIG_FILE" +fi + +SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "gb300,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) +echo "$SRTCTL_OUTPUT" + +JOB_ID=$(echo "$SRTCTL_OUTPUT" | grep -oP '✅ Job \K[0-9]+' || echo "$SRTCTL_OUTPUT" | grep -oP 'Job \K[0-9]+') + +set +x + +if [ -z "$JOB_ID" ]; then + echo "Error: Failed to extract JOB_ID from srtctl output" + exit 1 +fi + +echo "Extracted JOB_ID: $JOB_ID" + +LOGS_DIR="outputs/$JOB_ID/logs" +LOG_FILE="$LOGS_DIR/sweep_${JOB_ID}.log" + +while ! ls "$LOG_FILE" &>/dev/null; do + if ! squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; then + echo "ERROR: Job $JOB_ID failed before creating log file" + scontrol show job "$JOB_ID" + exit 1 + fi + echo "Waiting for JOB_ID $JOB_ID to begin and $LOG_FILE to appear..." + sleep 5 +done + +( + while squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; do + sleep 10 + done +) & +POLL_PID=$! + +echo "Tailing LOG_FILE: $LOG_FILE" + +tail -F -s 2 -n+1 "$LOG_FILE" --pid=$POLL_PID 2>/dev/null + +wait $POLL_PID + +set -x + +echo "Job $JOB_ID completed!" +echo "Collecting results..." 
+ +if [ -d "$LOGS_DIR" ]; then + echo "Found logs directory: $LOGS_DIR" + archive_server_logs +else + echo "Warning: Logs directory not found at $LOGS_DIR" +fi + +if [[ "${EVAL_ONLY:-false}" != "true" ]]; then + if [ ! -d "$LOGS_DIR" ]; then + exit 1 + fi + + RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) + + if [ -z "$RESULT_SUBDIRS" ]; then + echo "Warning: No result subdirectories found in $LOGS_DIR" + else + for result_subdir in $RESULT_SUBDIRS; do + echo "Processing result subdirectory: $result_subdir" + + CONFIG_NAME=$(basename "$result_subdir") + + RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) + + for result_file in $RESULT_FILES; do + if [ -f "$result_file" ]; then + filename=$(basename "$result_file") + concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') + gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') + ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') + gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') + + echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" + cp "$result_file" "$WORKSPACE_RESULT_FILE" + + echo "Copied result file to: $WORKSPACE_RESULT_FILE" + fi + done + done + fi + + echo "All result files processed" +else + echo "EVAL_ONLY=true: Skipping benchmark result collection" +fi + +if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then + EVAL_DIR="$LOGS_DIR/eval_results" + if [ -d "$EVAL_DIR" ]; then + echo "Extracting eval results from $EVAL_DIR" + shopt -s nullglob + for eval_file in "$EVAL_DIR"/*; do + [ -f "$eval_file" ] || continue + cp "$eval_file" "$GITHUB_WORKSPACE/" + echo "Copied eval artifact: $(basename "$eval_file")" + done + shopt -u 
nullglob + else + echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR" + fi +fi diff --git a/utils/test_process_result.py b/utils/test_process_result.py index 2a6389a78..d33f1c1a5 100644 --- a/utils/test_process_result.py +++ b/utils/test_process_result.py @@ -47,6 +47,7 @@ def base_env_vars(): "OSL": "1024", "DISAGG": "false", "MODEL_PREFIX": "dsr1", + "IMAGE": "test-image", }