diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml index faa51d369..921909434 100644 --- a/.github/workflows/70b-tmpl.yml +++ b/.github/workflows/70b-tmpl.yml @@ -75,7 +75,7 @@ jobs: runner: b200 image: 'kedarpotdar147/vllm:05' model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - tp-list: '[1, 2, 4, 8]' + tp-list: '[1, 2]' timeout: ${{ inputs.timeout }} bmk-mi300x: @@ -127,7 +127,7 @@ jobs: timeout: ${{ inputs.timeout }} collect-results: - needs: [bmk-h100, bmk-h200, bmk-b200, bmk-mi300x, bmk-mi325x, bmk-mi355x] + needs: [bmk-h100, bmk-h200, bmk-b200, bmk-mi300x, bmk-mi325x, bmk-mi355x] if: ${{ always() && !cancelled() }} uses: ./.github/workflows/collect-results.yml secrets: inherit diff --git a/.github/workflows/dsr1-tmpl.yml b/.github/workflows/dsr1-tmpl.yml index 07030c387..59e81c38b 100644 --- a/.github/workflows/dsr1-tmpl.yml +++ b/.github/workflows/dsr1-tmpl.yml @@ -111,7 +111,7 @@ jobs: timeout: ${{ inputs.timeout }} collect-results: - needs: [bmk-h200, bmk-b200, bmk-mi300x, bmk-mi325x, bmk-mi355x] + needs: [bmk-h200, bmk-b200, bmk-mi300x, bmk-mi325x, bmk-mi355x] if: ${{ always() && !cancelled() }} uses: ./.github/workflows/collect-results.yml secrets: inherit diff --git a/.github/workflows/workflow-scheduler.yml b/.github/workflows/workflow-scheduler.yml index 115452ddf..a0964526b 100644 --- a/.github/workflows/workflow-scheduler.yml +++ b/.github/workflows/workflow-scheduler.yml @@ -35,47 +35,47 @@ jobs: max-model-len: 2048 random-range-ratio: 0.8 - _70b-8k1k: - needs: cleanup - uses: ./.github/workflows/70b-tmpl.yml - secrets: inherit - with: - exp-name: '70b_8k1k' - isl: 8192 - osl: 1024 - max-model-len: 9216 - random-range-ratio: 0.8 + # _70b-8k1k: + # needs: cleanup + # uses: ./.github/workflows/70b-tmpl.yml + # secrets: inherit + # with: + # exp-name: '70b_8k1k' + # isl: 8192 + # osl: 1024 + # max-model-len: 9216 + # random-range-ratio: 0.8 - dsr1-8k1k: - needs: cleanup - uses: ./.github/workflows/dsr1-tmpl.yml - secrets: inherit - 
with: - exp-name: 'dsr1_8k1k' - isl: 8192 - osl: 1024 - max-model-len: 9216 - random-range-ratio: 0.8 + # dsr1-8k1k: + # needs: cleanup + # uses: ./.github/workflows/dsr1-tmpl.yml + # secrets: inherit + # with: + # exp-name: 'dsr1_8k1k' + # isl: 8192 + # osl: 1024 + # max-model-len: 9216 + # random-range-ratio: 0.8 - _70b-1k8k: - needs: cleanup - uses: ./.github/workflows/70b-tmpl.yml - secrets: inherit - with: - exp-name: '70b_1k8k' - isl: 1024 - osl: 8192 - max-model-len: 9216 - random-range-ratio: 0.8 - timeout: 240 + # _70b-1k8k: + # needs: cleanup + # uses: ./.github/workflows/70b-tmpl.yml + # secrets: inherit + # with: + # exp-name: '70b_1k8k' + # isl: 1024 + # osl: 8192 + # max-model-len: 9216 + # random-range-ratio: 0.8 + # timeout: 240 - dsr1-1k8k: - needs: cleanup - uses: ./.github/workflows/dsr1-tmpl.yml - secrets: inherit - with: - exp-name: 'dsr1_1k8k' - isl: 1024 - osl: 8192 - max-model-len: 9216 - random-range-ratio: 0.8 + # dsr1-1k8k: + # needs: cleanup + # uses: ./.github/workflows/dsr1-tmpl.yml + # secrets: inherit + # with: + # exp-name: 'dsr1_1k8k' + # isl: 1024 + # osl: 8192 + # max-model-len: 9216 + # random-range-ratio: 0.8 diff --git a/benchmarks/70b_b200_slurm.sh b/benchmarks/70b_b200_slurm.sh index f11133cc4..a07a3070e 100644 --- a/benchmarks/70b_b200_slurm.sh +++ b/benchmarks/70b_b200_slurm.sh @@ -20,8 +20,7 @@ hf download $MODEL SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) -pip install "git+https://github.com/flashinfer-ai/flashinfer.git@9720182476ede910698f8d783c29b2ec91cec023#egg=flashinfer-python" -pip install --upgrade --no-deps nvidia-nccl-cu12==2.26.2.post1 +pip install flashinfer-python==0.3.0 export TORCH_CUDA_ARCH_LIST="10.0" export VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB='{"2":32,"4":32,"8":8}' diff --git a/benchmarks/dsr1_b200_slurm.sh b/benchmarks/dsr1_b200_slurm.sh index 2aa45be79..28e9e2a32 100644 --- a/benchmarks/dsr1_b200_slurm.sh +++ b/benchmarks/dsr1_b200_slurm.sh @@ -17,18 +17,30 
@@ python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT > $SERVER_LOG 2>&1 & set +x +IGNORE_PAT="Ignore import error when loading sglang.srt.models.glm4v_moe: No module named 'transformers.models.glm4v_moe'" + while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then - sleep 5 - tail -n100 $SERVER_LOG - echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" - exit 1 - fi - if [[ "$line" == *"The server is fired up and ready to roll"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") + printf '%s\n' "$line" + + # Skip the known benign "Ignore import error ..." line + if [[ "$line" == *"$IGNORE_PAT"* ]]; then + continue + fi + + # Keep your original "error" trap for everything else + if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then + sleep 5 + tail -n100 "$SERVER_LOG" + echo "JOB ${SLURM_JOB_ID:-NA} ran on NODE ${SLURMD_NODENAME:-unknown}" + exit 1 + fi + + # Break when server is ready + if [[ "$line" == *"The server is fired up and ready to roll"* ]]; then + break + fi +# Start tail from the beginning so we don't miss early lines +done < <(tail -n +1 -F "$SERVER_LOG") set -x git clone https://github.com/kimbochen/bench_serving.git diff --git a/runners/launch_b200-nv.sh b/runners/launch_b200-nv.sh index 2808758db..8bf679d58 100644 --- a/runners/launch_b200-nv.sh +++ b/runners/launch_b200-nv.sh @@ -5,7 +5,7 @@ export PORT_OFFSET=${USER: -1} MODEL_CODE="${1%%_*}" PARTITION="dgx-b200" -SQUASH_FILE="/raid/image_${MODEL_CODE}_b200-2.sqsh" +SQUASH_FILE="/raid/image_${MODEL_CODE}_b200-0903.sqsh" salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell JOB_ID=$(squeue -u $USER -h -o %A | head -n1)