SemiAnalysisAI · kimbochen · Sep 4, 2025 · Sep 3, 2025 · Sep 3, 2025 · Sep 3, 2025
diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml
@@ -75,7 +75,7 @@ jobs:
       runner: b200
       image: 'kedarpotdar147/vllm:05'
       model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
-      tp-list: '[1, 2, 4, 8]'
+      tp-list: '[1,2]'
       timeout: ${{ inputs.timeout }}
 
   bmk-mi300x:
@@ -127,7 +127,7 @@ jobs:
       timeout: ${{ inputs.timeout }}
 
   collect-results:
-    needs: [bmk-h100, bmk-h200, bmk-b200, bmk-mi300x, bmk-mi325x, bmk-mi355x]
+    needs: [bmk-h100, bmk-h200, bmk-b200, bmk-mi300x, bmk-mi325x, bmk-mi355x]
     if: ${{ always() && !cancelled() }}
     uses: ./.github/workflows/collect-results.yml
     secrets: inherit

diff --git a/.github/workflows/dsr1-tmpl.yml b/.github/workflows/dsr1-tmpl.yml
@@ -111,7 +111,7 @@ jobs:
       timeout: ${{ inputs.timeout }}
 
   collect-results:
-    needs: [bmk-h200, bmk-b200, bmk-mi300x, bmk-mi325x, bmk-mi355x]
+    needs: [bmk-h200, bmk-b200, bmk-mi300x, bmk-mi325x, bmk-mi355x]
     if: ${{ always() && !cancelled() }}
     uses: ./.github/workflows/collect-results.yml
     secrets: inherit

diff --git a/.github/workflows/workflow-scheduler.yml b/.github/workflows/workflow-scheduler.yml
@@ -35,47 +35,47 @@ jobs:
       max-model-len: 2048
       random-range-ratio: 0.8
 
-  _70b-8k1k:
-    needs: cleanup
-    uses: ./.github/workflows/70b-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: '70b_8k1k'
-      isl: 8192
-      osl: 1024
-      max-model-len: 9216
-      random-range-ratio: 0.8
+  # _70b-8k1k:
+  #   needs: cleanup
+  #   uses: ./.github/workflows/70b-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: '70b_8k1k'
+  #     isl: 8192
+  #     osl: 1024
+  #     max-model-len: 9216
+  #     random-range-ratio: 0.8
 
-  dsr1-8k1k:
-    needs: cleanup
-    uses: ./.github/workflows/dsr1-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: 'dsr1_8k1k'
-      isl: 8192
-      osl: 1024
-      max-model-len: 9216
-      random-range-ratio: 0.8
+  # dsr1-8k1k:
+  #   needs: cleanup
+  #   uses: ./.github/workflows/dsr1-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: 'dsr1_8k1k'
+  #     isl: 8192
+  #     osl: 1024
+  #     max-model-len: 9216
+  #     random-range-ratio: 0.8
 
-  _70b-1k8k:
-    needs: cleanup
-    uses: ./.github/workflows/70b-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: '70b_1k8k'
-      isl: 1024
-      osl: 8192
-      max-model-len: 9216
-      random-range-ratio: 0.8
-      timeout: 240
+  # _70b-1k8k:
+  #   needs: cleanup
+  #   uses: ./.github/workflows/70b-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: '70b_1k8k'
+  #     isl: 1024
+  #     osl: 8192
+  #     max-model-len: 9216
+  #     random-range-ratio: 0.8
+  #     timeout: 240
 
-  dsr1-1k8k:
-    needs: cleanup
-    uses: ./.github/workflows/dsr1-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: 'dsr1_1k8k'
-      isl: 1024
-      osl: 8192
-      max-model-len: 9216
-      random-range-ratio: 0.8
+  # dsr1-1k8k:
+  #   needs: cleanup
+  #   uses: ./.github/workflows/dsr1-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: 'dsr1_1k8k'
+  #     isl: 1024
+  #     osl: 8192
+  #     max-model-len: 9216
+  #     random-range-ratio: 0.8
diff --git a/benchmarks/70b_b200_slurm.sh b/benchmarks/70b_b200_slurm.sh
@@ -20,8 +20,7 @@ hf download $MODEL
 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
 PORT=$(( 8888 + $PORT_OFFSET ))
 
-pip install "git+https://github.com/flashinfer-ai/flashinfer.git@9720182476ede910698f8d783c29b2ec91cec023#egg=flashinfer-python"
-pip install --upgrade --no-deps nvidia-nccl-cu12==2.26.2.post1
+pip install flashinfer-python==0.3.0
 
 export TORCH_CUDA_ARCH_LIST="10.0"
 export VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB='{"2":32,"4":32,"8":8}'

diff --git a/benchmarks/dsr1_b200_slurm.sh b/benchmarks/dsr1_b200_slurm.sh
@@ -17,18 +17,30 @@ python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT
 > $SERVER_LOG 2>&1 &
 
 set +x
+IGNORE_PAT="Ignore import error when loading sglang.srt.models.glm4v_moe: No module named 'transformers.models.glm4v_moe'"
+
 while IFS= read -r line; do
-    printf '%s\n' "$line"
-    if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then
-        sleep 5
-        tail -n100 $SERVER_LOG
-        echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME"
-        exit 1
-    fi
-    if [[ "$line" == *"The server is fired up and ready to roll"* ]]; then
-        break
-    fi
-done < <(tail -F -n0 "$SERVER_LOG")
+  printf '%s\n' "$line"
+
+  # Skip the known benign "Ignore import error ..." line
+  if [[ "$line" == *"$IGNORE_PAT"* ]]; then
+    continue
+  fi
+
+  # Abort on any other line that contains "error" (case-insensitive)
+  if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then
+    sleep 5
+    tail -n100 "$SERVER_LOG"
+    echo "JOB ${SLURM_JOB_ID:-NA} ran on NODE ${SLURMD_NODENAME:-unknown}"
+    exit 1
+  fi
+
+  # Break when server is ready
+  if [[ "$line" == *"The server is fired up and ready to roll"* ]]; then
+    break
+  fi
+# Start tail from the beginning so we don't miss early lines
+done < <(tail -n +1 -F "$SERVER_LOG")
 
 set -x
 git clone https://github.com/kimbochen/bench_serving.git

diff --git a/runners/launch_b200-nv.sh b/runners/launch_b200-nv.sh
@@ -5,7 +5,7 @@ export PORT_OFFSET=${USER: -1}
 
 MODEL_CODE="${1%%_*}"
 PARTITION="dgx-b200"
-SQUASH_FILE="/raid/image_${MODEL_CODE}_b200-2.sqsh"
+SQUASH_FILE="/raid/image_${MODEL_CODE}_b200-0903.sqsh"
 
 salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell
 JOB_ID=$(squeue -u $USER -h -o %A | head -n1)