SemiAnalysisAI · kimbochen · Sep 3, 2025 · Sep 2, 2025 · Sep 2, 2025 · Sep 3, 2025
diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml
@@ -73,7 +73,7 @@ jobs:
       max-model-len: ${{ inputs.max-model-len }}
       random-range-ratio: ${{ inputs.random-range-ratio }}
       runner: b200
-      image: 'kedarpotdar147/vllm0.1:latest'
+      image: 'kedarpotdar147/vllm:05'
       model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
       tp-list: '[1, 2, 4, 8]'
       timeout: ${{ inputs.timeout }}

diff --git a/.github/workflows/dsr1-tmpl.yml b/.github/workflows/dsr1-tmpl.yml
@@ -57,7 +57,7 @@ jobs:
       max-model-len: ${{ inputs.max-model-len }}
       random-range-ratio: ${{ inputs.random-range-ratio }}
       runner: b200
-      image: 'lmsysorg/sglang:v0.4.10.post1-cu128-b200'
+      image: 'lmsysorg/sglang:v0.5.0rc1-cu128-b200'
       model: 'deepseek-ai/DeepSeek-R1-0528'
       tp-list: '[8]'
       timeout: ${{ inputs.timeout }}

diff --git a/benchmarks/70b_b200_slurm.sh b/benchmarks/70b_b200_slurm.sh
@@ -21,12 +21,30 @@ hf download $MODEL
 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
 PORT=$(( 8888 + $PORT_OFFSET ))
 
+# Swap the nvidia-nccl-cu12 wheel for the pinned 2.26.2.post1 release.
+pip uninstall -y nvidia-nccl-cu12
+pip install nvidia-nccl-cu12==2.26.2.post1
+
+# Replace flashinfer-python with a source build at a pinned commit.
+# git -C and pip install ./flashinfer leave the script's working directory unchanged.
+pip uninstall -y flashinfer-python
+git clone --recursive https://github.com/flashinfer-ai/flashinfer.git
+git -C flashinfer checkout 9720182476ede910698f8d783c29b2ec91cec023
+git -C flashinfer submodule update --init --recursive
+pip install ./flashinfer
+
+export VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB='{"2":32,"4":32,"8":8}'
+
+# JSON handed verbatim to --compilation-config below; quote it at the use site.
+FUSION_FLAG='{"pass_config":{"enable_fi_allreduce_fusion":true,"enable_attn_fusion":true,"enable_noop":true},"custom_ops":["+quant_fp8","+rms_norm"],"cudagraph_mode":"FULL_DECODE_ONLY","splitting_ops":[]}'
+
+# NOTE(review): --max-num-seqs is fixed at 512 rather than $CONC; confirm this is intended.
 export TORCH_CUDA_ARCH_LIST="10.0"
 vllm serve $MODEL --host 0.0.0.0 --port $PORT \
---trust-remote-code --quantization modelopt --kv-cache-dtype fp8 --gpu-memory-utilization 0.9 \
---pipeline-parallel-size 1 --tensor-parallel-size $TP --max-num-seqs $CONC --max-num-batched-tokens 8192 --max-model-len $MAX_MODEL_LEN \
+--trust-remote-code --kv-cache-dtype fp8 --gpu-memory-utilization 0.9 \
+--pipeline-parallel-size 1 --tensor-parallel-size $TP --max-num-batched-tokens 8192 --max-num-seqs 512 --max-model-len $MAX_MODEL_LEN \
 --enable-chunked-prefill --async-scheduling --no-enable-prefix-caching \
---compilation-config '{"pass_config": {"enable_fi_allreduce_fusion": true}, "custom_ops": ["+rms_norm"], "level": 3}' \
+--compilation-config "${FUSION_FLAG}" \
 --disable-log-requests > $SERVER_LOG 2>&1 &
 
 set +x
@@ -54,4 +72,4 @@ python3 bench_serving/benchmark_serving.py \
 --request-rate inf --ignore-eos \
 --save-result --percentile-metrics 'ttft,tpot,itl,e2el' \
 --result-dir /workspace/ \
---result-filename $RESULT_FILENAME.json
+--result-filename $RESULT_FILENAME.json
diff --git a/benchmarks/dsr1_b200_slurm.sh b/benchmarks/dsr1_b200_slurm.sh
@@ -7,12 +7,14 @@ SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
 
 set -x
 PORT=$(( 8888 + $PORT_OFFSET ))
-export SGL_ENABLE_JIT_DEEPGEMM=0
+# Env toggles exported before the server launch; presumably read by SGLang - confirm the names against the image's SGLang version.
+export SGL_ENABLE_JIT_DEEPGEMM=false
+export SGLANG_ENABLE_FLASHINFER_GEMM=true
 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \
 --tensor-parallel-size=$TP --data-parallel-size=1 \
---cuda-graph-max-bs 256 --max-running-requests 512 --mem-fraction-static 0.89 \
+--cuda-graph-max-bs 128 --max-running-requests 128 --mem-fraction-static 0.82 --kv-cache-dtype fp8_e4m3 \
 --chunked-prefill-size 32768 --max-prefill-tokens 32768 \
---disable-radix-cache --attention-backend trtllm_mla --disable-shared-experts-fusion --enable-flashinfer-trtllm-moe \
+--disable-radix-cache --attention-backend trtllm_mla --enable-flashinfer-trtllm-moe --stream-interval 1 \
 > $SERVER_LOG 2>&1 &
 
 set +x

diff --git a/runners/launch_b200-nv.sh b/runners/launch_b200-nv.sh
@@ -5,7 +5,7 @@ export PORT_OFFSET=${USER: -1}
 
 MODEL_CODE="${1%%_*}"
 PARTITION="dgx-b200"
-SQUASH_FILE="/raid/image_${MODEL_CODE}_b200.sqsh"
+SQUASH_FILE="/raid/image_${MODEL_CODE}_b200-2.sqsh"
 
 salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell
 JOB_ID=$(squeue -u $USER -h -o %A | head -n1)