diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml
index 3ed4e1b2a..ea3444fc3 100644
--- a/.github/workflows/70b-tmpl.yml
+++ b/.github/workflows/70b-tmpl.yml
@@ -73,7 +73,7 @@ jobs:
       max-model-len: ${{ inputs.max-model-len }}
       random-range-ratio: ${{ inputs.random-range-ratio }}
       runner: b200
-      image: 'kedarpotdar147/vllm0.1:latest'
+      image: 'kedarpotdar147/vllm:05'
       model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
       tp-list: '[1, 2, 4, 8]'
       timeout: ${{ inputs.timeout }}
diff --git a/.github/workflows/dsr1-tmpl.yml b/.github/workflows/dsr1-tmpl.yml
index ced5d1f98..d4731aa6d 100644
--- a/.github/workflows/dsr1-tmpl.yml
+++ b/.github/workflows/dsr1-tmpl.yml
@@ -57,7 +57,7 @@ jobs:
       max-model-len: ${{ inputs.max-model-len }}
       random-range-ratio: ${{ inputs.random-range-ratio }}
       runner: b200
-      image: 'lmsysorg/sglang:v0.4.10.post1-cu128-b200'
+      image: 'lmsysorg/sglang:v0.5.0rc1-cu128-b200'
      model: 'deepseek-ai/DeepSeek-R1-0528'
       tp-list: '[8]'
       timeout: ${{ inputs.timeout }}
diff --git a/benchmarks/70b_b200_slurm.sh b/benchmarks/70b_b200_slurm.sh
index fd444abab..d9c9bcacc 100644
--- a/benchmarks/70b_b200_slurm.sh
+++ b/benchmarks/70b_b200_slurm.sh
@@ -21,12 +21,31 @@ hf download $MODEL
 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
 PORT=$(( 8888 + $PORT_OFFSET ))
 
+# Pin NCCL to a known-good release
+pip uninstall -y nvidia-nccl-cu12
+pip install nvidia-nccl-cu12==2.26.2.post1
+
+# Replace the prebuilt FlashInfer wheel with a build from a pinned commit
+pip uninstall -y flashinfer-python
+git clone --recursive https://github.com/flashinfer-ai/flashinfer.git
+cd flashinfer
+git checkout 9720182476ede910698f8d783c29b2ec91cec023
+git submodule update --init --recursive
+pip install .
+cd ..
+
+# Per-world-size thresholds (MB) for the FlashInfer all-reduce fusion pass
+export VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB='{"2":32,"4":32,"8":8}'
+
+# Compilation config: all-reduce and attention fusion passes, decode-only CUDA graphs
+FUSION_FLAG='{"pass_config":{"enable_fi_allreduce_fusion":true,"enable_attn_fusion":true,"enable_noop":true},"custom_ops":["+quant_fp8","+rms_norm"],"cudagraph_mode":"FULL_DECODE_ONLY","splitting_ops":[]}'
+
 export TORCH_CUDA_ARCH_LIST="10.0"
 vllm serve $MODEL --host 0.0.0.0 --port $PORT \
---trust-remote-code --quantization modelopt --kv-cache-dtype fp8 --gpu-memory-utilization 0.9 \
+--trust-remote-code --kv-cache-dtype fp8 --gpu-memory-utilization 0.9 \
 --pipeline-parallel-size 1 --tensor-parallel-size $TP --max-num-seqs $CONC --max-num-batched-tokens 8192 --max-model-len $MAX_MODEL_LEN \
 --enable-chunked-prefill --async-scheduling --no-enable-prefix-caching \
---compilation-config '{"pass_config": {"enable_fi_allreduce_fusion": true}, "custom_ops": ["+rms_norm"], "level": 3}' \
+--compilation-config ${FUSION_FLAG} \
 --disable-log-requests > $SERVER_LOG 2>&1 &
 set +x
 
@@ -54,4 +73,4 @@ python3 bench_serving/benchmark_serving.py \
 --request-rate inf --ignore-eos \
 --save-result --percentile-metrics 'ttft,tpot,itl,e2el' \
 --result-dir /workspace/ \
---result-filename $RESULT_FILENAME.json
+--result-filename $RESULT_FILENAME.json
\ No newline at end of file
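The two JSON strings introduced above (VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB and FUSION_FLAG) are easy to break when edited by hand, and vllm only reports a parse failure at startup. A minimal pre-flight check, sketched here on the assumption that python3 is available inside the container, is to pipe both through json.tool before launching the server:

    # fail fast if either config string is malformed JSON
    echo "$VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB" | python3 -m json.tool >/dev/null || exit 1
    echo "$FUSION_FLAG" | python3 -m json.tool >/dev/null || exit 1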
diff --git a/benchmarks/dsr1_b200_slurm.sh b/benchmarks/dsr1_b200_slurm.sh
index e18f536dd..2aa45be79 100644
--- a/benchmarks/dsr1_b200_slurm.sh
+++ b/benchmarks/dsr1_b200_slurm.sh
@@ -7,12 +7,13 @@ SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
 set -x
 PORT=$(( 8888 + $PORT_OFFSET ))
 
-export SGL_ENABLE_JIT_DEEPGEMM=0
+export SGL_ENABLE_JIT_DEEPGEMM=false
+export SGLANG_ENABLE_FLASHINFER_GEMM=true
 
 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \
 --tensor-parallel-size=$TP --data-parallel-size=1 \
---cuda-graph-max-bs 256 --max-running-requests 512 --mem-fraction-static 0.89 \
+--cuda-graph-max-bs 128 --max-running-requests 128 --mem-fraction-static 0.82 --kv-cache-dtype fp8_e4m3 \
 --chunked-prefill-size 32768 --max-prefill-tokens 32768 \
---disable-radix-cache --attention-backend trtllm_mla --disable-shared-experts-fusion --enable-flashinfer-trtllm-moe \
+--disable-radix-cache --attention-backend trtllm_mla --enable-flashinfer-trtllm-moe --stream-interval 1 \
 > $SERVER_LOG 2>&1 &
 set +x
diff --git a/runners/launch_b200-nv.sh b/runners/launch_b200-nv.sh
index fd47bf58d..2808758db 100644
--- a/runners/launch_b200-nv.sh
+++ b/runners/launch_b200-nv.sh
@@ -5,7 +5,7 @@
 export PORT_OFFSET=${USER: -1}
 MODEL_CODE="${1%%_*}"
 PARTITION="dgx-b200"
-SQUASH_FILE="/raid/image_${MODEL_CODE}_b200.sqsh"
+SQUASH_FILE="/raid/image_${MODEL_CODE}_b200-2.sqsh"
 
 salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell
 JOB_ID=$(squeue -u $USER -h -o %A | head -n1)
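Both benchmark scripts background the server and capture its output only in $SERVER_LOG, so the benchmark step has to wait for readiness before sending traffic. A polling loop of this shape is one way to gate it (a sketch, assuming curl is present in the image; vLLM and SGLang each expose a /health endpoint):

    # block until the server answers on its health endpoint
    until curl -sf "http://localhost:${PORT}/health" >/dev/null; do
        sleep 10
    done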