From 4f9ee5e6c2ddf4e05a54362f5b1250a72d312dfa Mon Sep 17 00:00:00 2001 From: Kedar Potdar Date: Wed, 3 Sep 2025 09:21:18 -0700 Subject: [PATCH 1/6] fix vllm launch --- .github/workflows/70b-tmpl.yml | 154 +++++++++++------------ .github/workflows/workflow-scheduler.yml | 102 +++++++-------- benchmarks/70b_b200_slurm.sh | 3 +- runners/launch_b200-nv.sh | 2 +- 4 files changed, 130 insertions(+), 131 deletions(-) diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml index faa51d369..83717ede9 100644 --- a/.github/workflows/70b-tmpl.yml +++ b/.github/workflows/70b-tmpl.yml @@ -30,37 +30,37 @@ jobs: - name: Find the latest Docker image run: echo "Hardcoding image tags for now." - bmk-h100: - needs: find-latest-image - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: h100 - image: 'kedarpotdar147/vllm0.1:latest' - model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - tp-list: '[2, 4, 8]' - timeout: ${{ inputs.timeout }} + # bmk-h100: + # needs: find-latest-image + # uses: ./.github/workflows/benchmark-tmpl.yml + # secrets: inherit + # with: + # exp-name: ${{ inputs.exp-name }} + # isl: ${{ inputs.isl }} + # osl: ${{ inputs.osl }} + # max-model-len: ${{ inputs.max-model-len }} + # random-range-ratio: ${{ inputs.random-range-ratio }} + # runner: h100 + # image: 'kedarpotdar147/vllm0.1:latest' + # model: 'nvidia/Llama-3.3-70B-Instruct-FP8' + # tp-list: '[2, 4, 8]' + # timeout: ${{ inputs.timeout }} - bmk-h200: - needs: find-latest-image - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: h200 - image: 'kedarpotdar147/vllm0.1:latest' 
- model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - tp-list: '[1, 2, 4, 8]' - timeout: ${{ inputs.timeout }} + # bmk-h200: + # needs: find-latest-image + # uses: ./.github/workflows/benchmark-tmpl.yml + # secrets: inherit + # with: + # exp-name: ${{ inputs.exp-name }} + # isl: ${{ inputs.isl }} + # osl: ${{ inputs.osl }} + # max-model-len: ${{ inputs.max-model-len }} + # random-range-ratio: ${{ inputs.random-range-ratio }} + # runner: h200 + # image: 'kedarpotdar147/vllm0.1:latest' + # model: 'nvidia/Llama-3.3-70B-Instruct-FP8' + # tp-list: '[1, 2, 4, 8]' + # timeout: ${{ inputs.timeout }} bmk-b200: needs: find-latest-image @@ -75,59 +75,59 @@ jobs: runner: b200 image: 'kedarpotdar147/vllm:05' model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - tp-list: '[1, 2, 4, 8]' + tp-list: '[2]' timeout: ${{ inputs.timeout }} - bmk-mi300x: - needs: find-latest-image - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: mi300x - image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718' - model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' - tp-list: '[1, 2, 4, 8]' - timeout: ${{ inputs.timeout }} + # bmk-mi300x: + # needs: find-latest-image + # uses: ./.github/workflows/benchmark-tmpl.yml + # secrets: inherit + # with: + # exp-name: ${{ inputs.exp-name }} + # isl: ${{ inputs.isl }} + # osl: ${{ inputs.osl }} + # max-model-len: ${{ inputs.max-model-len }} + # random-range-ratio: ${{ inputs.random-range-ratio }} + # runner: mi300x + # image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718' + # model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' + # tp-list: '[1, 2, 4, 8]' + # timeout: ${{ inputs.timeout }} - bmk-mi325x: - needs: find-latest-image - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ 
inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: mi325x - image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718' - model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' - tp-list: '[1, 2, 4, 8]' - timeout: ${{ inputs.timeout }} + # bmk-mi325x: + # needs: find-latest-image + # uses: ./.github/workflows/benchmark-tmpl.yml + # secrets: inherit + # with: + # exp-name: ${{ inputs.exp-name }} + # isl: ${{ inputs.isl }} + # osl: ${{ inputs.osl }} + # max-model-len: ${{ inputs.max-model-len }} + # random-range-ratio: ${{ inputs.random-range-ratio }} + # runner: mi325x + # image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718' + # model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' + # tp-list: '[1, 2, 4, 8]' + # timeout: ${{ inputs.timeout }} - bmk-mi355x: - needs: find-latest-image - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: mi355x - image: 'rocm/7.0-preview:rocm7.0_preview_ubuntu_22.04_vllm_0.9.1_mi35x_alpha' - model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' - tp-list: '[1, 2]' - timeout: ${{ inputs.timeout }} + # bmk-mi355x: + # needs: find-latest-image + # uses: ./.github/workflows/benchmark-tmpl.yml + # secrets: inherit + # with: + # exp-name: ${{ inputs.exp-name }} + # isl: ${{ inputs.isl }} + # osl: ${{ inputs.osl }} + # max-model-len: ${{ inputs.max-model-len }} + # random-range-ratio: ${{ inputs.random-range-ratio }} + # runner: mi355x + # image: 'rocm/7.0-preview:rocm7.0_preview_ubuntu_22.04_vllm_0.9.1_mi35x_alpha' + # model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' + # tp-list: '[1, 2]' + # timeout: ${{ inputs.timeout }} collect-results: - needs: [bmk-h100, bmk-h200, bmk-b200, bmk-mi300x, bmk-mi325x, bmk-mi355x] + needs: [bmk-b200 ] if: ${{ always() && !cancelled() }} uses: 
./.github/workflows/collect-results.yml secrets: inherit diff --git a/.github/workflows/workflow-scheduler.yml b/.github/workflows/workflow-scheduler.yml index 115452ddf..84b1cfc26 100644 --- a/.github/workflows/workflow-scheduler.yml +++ b/.github/workflows/workflow-scheduler.yml @@ -24,58 +24,58 @@ jobs: max-model-len: 2048 random-range-ratio: 0.8 - dsr1-1k1k: - needs: cleanup - uses: ./.github/workflows/dsr1-tmpl.yml - secrets: inherit - with: - exp-name: 'dsr1_1k1k' - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 + # dsr1-1k1k: + # needs: cleanup + # uses: ./.github/workflows/dsr1-tmpl.yml + # secrets: inherit + # with: + # exp-name: 'dsr1_1k1k' + # isl: 1024 + # osl: 1024 + # max-model-len: 2048 + # random-range-ratio: 0.8 - _70b-8k1k: - needs: cleanup - uses: ./.github/workflows/70b-tmpl.yml - secrets: inherit - with: - exp-name: '70b_8k1k' - isl: 8192 - osl: 1024 - max-model-len: 9216 - random-range-ratio: 0.8 + # _70b-8k1k: + # needs: cleanup + # uses: ./.github/workflows/70b-tmpl.yml + # secrets: inherit + # with: + # exp-name: '70b_8k1k' + # isl: 8192 + # osl: 1024 + # max-model-len: 9216 + # random-range-ratio: 0.8 - dsr1-8k1k: - needs: cleanup - uses: ./.github/workflows/dsr1-tmpl.yml - secrets: inherit - with: - exp-name: 'dsr1_8k1k' - isl: 8192 - osl: 1024 - max-model-len: 9216 - random-range-ratio: 0.8 + # dsr1-8k1k: + # needs: cleanup + # uses: ./.github/workflows/dsr1-tmpl.yml + # secrets: inherit + # with: + # exp-name: 'dsr1_8k1k' + # isl: 8192 + # osl: 1024 + # max-model-len: 9216 + # random-range-ratio: 0.8 - _70b-1k8k: - needs: cleanup - uses: ./.github/workflows/70b-tmpl.yml - secrets: inherit - with: - exp-name: '70b_1k8k' - isl: 1024 - osl: 8192 - max-model-len: 9216 - random-range-ratio: 0.8 - timeout: 240 + # _70b-1k8k: + # needs: cleanup + # uses: ./.github/workflows/70b-tmpl.yml + # secrets: inherit + # with: + # exp-name: '70b_1k8k' + # isl: 1024 + # osl: 8192 + # max-model-len: 9216 + # random-range-ratio: 0.8 
+ # timeout: 240 - dsr1-1k8k: - needs: cleanup - uses: ./.github/workflows/dsr1-tmpl.yml - secrets: inherit - with: - exp-name: 'dsr1_1k8k' - isl: 1024 - osl: 8192 - max-model-len: 9216 - random-range-ratio: 0.8 + # dsr1-1k8k: + # needs: cleanup + # uses: ./.github/workflows/dsr1-tmpl.yml + # secrets: inherit + # with: + # exp-name: 'dsr1_1k8k' + # isl: 1024 + # osl: 8192 + # max-model-len: 9216 + # random-range-ratio: 0.8 diff --git a/benchmarks/70b_b200_slurm.sh b/benchmarks/70b_b200_slurm.sh index f11133cc4..a07a3070e 100644 --- a/benchmarks/70b_b200_slurm.sh +++ b/benchmarks/70b_b200_slurm.sh @@ -20,8 +20,7 @@ hf download $MODEL SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) -pip install "git+https://github.com/flashinfer-ai/flashinfer.git@9720182476ede910698f8d783c29b2ec91cec023#egg=flashinfer-python" -pip install --upgrade --no-deps nvidia-nccl-cu12==2.26.2.post1 +pip install flashinfer-python==0.3.0 export TORCH_CUDA_ARCH_LIST="10.0" export VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB='{"2":32,"4":32,"8":8}' diff --git a/runners/launch_b200-nv.sh b/runners/launch_b200-nv.sh index 2808758db..e2e21d066 100644 --- a/runners/launch_b200-nv.sh +++ b/runners/launch_b200-nv.sh @@ -5,7 +5,7 @@ export PORT_OFFSET=${USER: -1} MODEL_CODE="${1%%_*}" PARTITION="dgx-b200" -SQUASH_FILE="/raid/image_${MODEL_CODE}_b200-2.sqsh" +SQUASH_FILE="/raid/image_${MODEL_CODE}_b200-vllm-3.sqsh" salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell JOB_ID=$(squeue -u $USER -h -o %A | head -n1) From 7e6577b06a74b65a362cb9dafbbeb754cfd942ef Mon Sep 17 00:00:00 2001 From: Kedar Potdar Date: Wed, 3 Sep 2025 09:34:55 -0700 Subject: [PATCH 2/6] re-enable dsr1 and update image ID to re-fetch --- .github/workflows/dsr1-tmpl.yml | 122 +++++++++++------------ .github/workflows/workflow-scheduler.yml | 20 ++-- runners/launch_b200-nv.sh | 2 +- 3 files changed, 72 insertions(+), 72 deletions(-) diff --git a/.github/workflows/dsr1-tmpl.yml 
b/.github/workflows/dsr1-tmpl.yml index 07030c387..14872b044 100644 --- a/.github/workflows/dsr1-tmpl.yml +++ b/.github/workflows/dsr1-tmpl.yml @@ -30,21 +30,21 @@ jobs: - name: Find the latest Docker image run: echo "Hardcoding image tags for now." - bmk-h200: - needs: find-latest-image - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: h200 - image: 'lmsysorg/sglang:v0.4.9.post1-cu126' - model: 'deepseek-ai/DeepSeek-R1-0528' - tp-list: '[8]' - timeout: ${{ inputs.timeout }} + # bmk-h200: + # needs: find-latest-image + # uses: ./.github/workflows/benchmark-tmpl.yml + # secrets: inherit + # with: + # exp-name: ${{ inputs.exp-name }} + # isl: ${{ inputs.isl }} + # osl: ${{ inputs.osl }} + # max-model-len: ${{ inputs.max-model-len }} + # random-range-ratio: ${{ inputs.random-range-ratio }} + # runner: h200 + # image: 'lmsysorg/sglang:v0.4.9.post1-cu126' + # model: 'deepseek-ai/DeepSeek-R1-0528' + # tp-list: '[8]' + # timeout: ${{ inputs.timeout }} bmk-b200: needs: find-latest-image @@ -62,56 +62,56 @@ jobs: tp-list: '[8]' timeout: ${{ inputs.timeout }} - bmk-mi300x: - needs: find-latest-image - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: mi300x - image: 'lmsysorg/sglang:v0.4.9.post2-rocm630-mi30x' - model: 'deepseek-ai/DeepSeek-R1-0528' - tp-list: '[8]' - timeout: ${{ inputs.timeout }} + # bmk-mi300x: + # needs: find-latest-image + # uses: ./.github/workflows/benchmark-tmpl.yml + # secrets: inherit + # with: + # exp-name: ${{ inputs.exp-name }} + # isl: ${{ inputs.isl }} + # osl: ${{ inputs.osl }} + # max-model-len: ${{ 
inputs.max-model-len }} + # random-range-ratio: ${{ inputs.random-range-ratio }} + # runner: mi300x + # image: 'lmsysorg/sglang:v0.4.9.post2-rocm630-mi30x' + # model: 'deepseek-ai/DeepSeek-R1-0528' + # tp-list: '[8]' + # timeout: ${{ inputs.timeout }} - bmk-mi325x: - needs: find-latest-image - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: mi325x - image: 'lmsysorg/sglang:v0.4.9.post2-rocm630-mi30x' - model: 'deepseek-ai/DeepSeek-R1-0528' - tp-list: '[8]' - timeout: ${{ inputs.timeout }} + # bmk-mi325x: + # needs: find-latest-image + # uses: ./.github/workflows/benchmark-tmpl.yml + # secrets: inherit + # with: + # exp-name: ${{ inputs.exp-name }} + # isl: ${{ inputs.isl }} + # osl: ${{ inputs.osl }} + # max-model-len: ${{ inputs.max-model-len }} + # random-range-ratio: ${{ inputs.random-range-ratio }} + # runner: mi325x + # image: 'lmsysorg/sglang:v0.4.9.post2-rocm630-mi30x' + # model: 'deepseek-ai/DeepSeek-R1-0528' + # tp-list: '[8]' + # timeout: ${{ inputs.timeout }} - bmk-mi355x: - needs: find-latest-image - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: mi355x - image: 'lmsysorg/sglang:v0.5.1.post2-rocm700-mi35x' - model: 'deepseek-ai/DeepSeek-R1-0528' - tp-list: '[8]' - timeout: ${{ inputs.timeout }} + # bmk-mi355x: + # needs: find-latest-image + # uses: ./.github/workflows/benchmark-tmpl.yml + # secrets: inherit + # with: + # exp-name: ${{ inputs.exp-name }} + # isl: ${{ inputs.isl }} + # osl: ${{ inputs.osl }} + # max-model-len: ${{ inputs.max-model-len }} + # random-range-ratio: ${{ inputs.random-range-ratio }} + # runner: 
mi355x + # image: 'lmsysorg/sglang:v0.5.1.post2-rocm700-mi35x' + # model: 'deepseek-ai/DeepSeek-R1-0528' + # tp-list: '[8]' + # timeout: ${{ inputs.timeout }} collect-results: - needs: [bmk-h200, bmk-b200, bmk-mi300x, bmk-mi325x, bmk-mi355x] + needs: [ bmk-b200,] if: ${{ always() && !cancelled() }} uses: ./.github/workflows/collect-results.yml secrets: inherit diff --git a/.github/workflows/workflow-scheduler.yml b/.github/workflows/workflow-scheduler.yml index 84b1cfc26..a0964526b 100644 --- a/.github/workflows/workflow-scheduler.yml +++ b/.github/workflows/workflow-scheduler.yml @@ -24,16 +24,16 @@ jobs: max-model-len: 2048 random-range-ratio: 0.8 - # dsr1-1k1k: - # needs: cleanup - # uses: ./.github/workflows/dsr1-tmpl.yml - # secrets: inherit - # with: - # exp-name: 'dsr1_1k1k' - # isl: 1024 - # osl: 1024 - # max-model-len: 2048 - # random-range-ratio: 0.8 + dsr1-1k1k: + needs: cleanup + uses: ./.github/workflows/dsr1-tmpl.yml + secrets: inherit + with: + exp-name: 'dsr1_1k1k' + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 # _70b-8k1k: # needs: cleanup diff --git a/runners/launch_b200-nv.sh b/runners/launch_b200-nv.sh index e2e21d066..8bf679d58 100644 --- a/runners/launch_b200-nv.sh +++ b/runners/launch_b200-nv.sh @@ -5,7 +5,7 @@ export PORT_OFFSET=${USER: -1} MODEL_CODE="${1%%_*}" PARTITION="dgx-b200" -SQUASH_FILE="/raid/image_${MODEL_CODE}_b200-vllm-3.sqsh" +SQUASH_FILE="/raid/image_${MODEL_CODE}_b200-0903.sqsh" salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell JOB_ID=$(squeue -u $USER -h -o %A | head -n1) From 22c9710ba61b163eb4540d1fd909bcd4c2b968fd Mon Sep 17 00:00:00 2001 From: Kedar Potdar Date: Wed, 3 Sep 2025 14:55:14 -0700 Subject: [PATCH 3/6] rollback dsr1 --- .github/workflows/dsr1-tmpl.yml | 2 +- benchmarks/dsr1_b200_docker.sh | 72 ++++++++++++++++++++++++++------- benchmarks/dsr1_b200_slurm.sh | 9 ++--- 3 files changed, 62 insertions(+), 21 deletions(-) diff --git 
a/.github/workflows/dsr1-tmpl.yml b/.github/workflows/dsr1-tmpl.yml index 14872b044..811c2c422 100644 --- a/.github/workflows/dsr1-tmpl.yml +++ b/.github/workflows/dsr1-tmpl.yml @@ -57,7 +57,7 @@ jobs: max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} runner: b200 - image: 'lmsysorg/sglang:v0.5.0rc1-cu128-b200' + image: 'lmsysorg/sglang:v0.4.10.post1-cu128-b200' model: 'deepseek-ai/DeepSeek-R1-0528' tp-list: '[8]' timeout: ${{ inputs.timeout }} diff --git a/benchmarks/dsr1_b200_docker.sh b/benchmarks/dsr1_b200_docker.sh index 68f83a169..df65471f6 100644 --- a/benchmarks/dsr1_b200_docker.sh +++ b/benchmarks/dsr1_b200_docker.sh @@ -1,20 +1,62 @@ #!/usr/bin/env bash -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE -# MODEL -# PORT -# TP -# CONC -# MAX_MODEL_LEN - -export SGL_ENABLE_JIT_DEEPGEMM=0 -export SGLANG_ENABLE_FLASHINFER_GEMM=1 +while [ -n "$(docker ps -aq)" ]; do + docker rm -f $(docker ps -aq) + docker network prune -f + sleep 5 +done + +network_name="bmk-net" +server_name="bmk-server" +client_name="bmk-client" +port=8888 + +docker network create $network_name set -x -python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT --trust-remote-code \ +docker run --rm -d --network $network_name --name $server_name \ +--runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ +-v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ +-e HF_TOKEN=$HF_TOKEN -e HF_HUB_CACHE=$HF_HUB_CACHE -e SGL_ENABLE_JIT_DEEPGEMM=0 \ +--entrypoint=python3 \ +$IMAGE \ +-m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $port --trust-remote-code \ --tensor-parallel-size=$TP --data-parallel-size=1 \ ---kv-cache-dtype=fp8_e4m3 --mem-fraction-static=0.82 \ ---max-prefill-tokens=32768 --chunked-prefill-size=32768 --cuda-graph-max-bs=128 --max-running-requests=128 \ ---disable-radix-cache --enable-flashinfer-trtllm-moe --attention-backend=trtllm_mla 
--stream-interval=1 +--cuda-graph-max-bs 256 --max-running-requests 512 --mem-fraction-static 0.89 \ +--chunked-prefill-size 32768 --max-prefill-tokens 32768 \ +--disable-radix-cache --attention-backend trtllm_mla --disable-shared-experts-fusion --enable-flashinfer-trtllm-moe + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then + docker stop $server_name + exit 1 + fi + if [[ "$line" == *"The server is fired up and ready to roll!"* ]]; then + break + fi +done < <(docker logs -f --tail=0 $server_name 2>&1) + +git clone https://github.com/kimbochen/bench_serving.git + +set -x +docker run --rm --network $network_name --name $client_name \ +-v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ -e HF_TOKEN=$HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ +--entrypoint=python3 \ +$IMAGE \ +bench_serving/benchmark_serving.py \ +--model $MODEL --backend vllm --base-url http://$server_name:$port \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) \ +--max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics "ttft,tpot,itl,e2el" \ +--result-dir /workspace/ --result-filename $RESULT_FILENAME.json + +while [ -n "$(docker ps -aq)" ]; do + docker stop $server_name + docker network rm $network_name + sleep 5 +done \ No newline at end of file diff --git a/benchmarks/dsr1_b200_slurm.sh b/benchmarks/dsr1_b200_slurm.sh index 2aa45be79..9a1ba6271 100644 --- a/benchmarks/dsr1_b200_slurm.sh +++ b/benchmarks/dsr1_b200_slurm.sh @@ -7,13 +7,12 @@ SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) set -x PORT=$(( 8888 + $PORT_OFFSET )) -export SGL_ENABLE_JIT_DEEPGEMM=false -export SGLANG_ENABLE_FLASHINFER_GEMM=true +export SGL_ENABLE_JIT_DEEPGEMM=0 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \ --tensor-parallel-size=$TP --data-parallel-size=1 \ 
---cuda-graph-max-bs 128 --max-running-requests 128 --mem-fraction-static 0.82 --kv-cache-dtype fp8_e4m3 \ +--cuda-graph-max-bs 256 --max-running-requests 512 --mem-fraction-static 0.89 \ --chunked-prefill-size 32768 --max-prefill-tokens 32768 \ ---disable-radix-cache --attention-backend trtllm_mla --enable-flashinfer-trtllm-moe --stream-interval 1 \ +--disable-radix-cache --attention-backend trtllm_mla --disable-shared-experts-fusion --enable-flashinfer-trtllm-moe \ > $SERVER_LOG 2>&1 & set +x @@ -41,4 +40,4 @@ python3 bench_serving/benchmark_serving.py \ --request-rate inf --ignore-eos \ --save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ --result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +--result-filename $RESULT_FILENAME.json \ No newline at end of file From 492de4cd041a7b90484bdba115d76126486b2ec7 Mon Sep 17 00:00:00 2001 From: Kedar Potdar Date: Wed, 3 Sep 2025 15:22:11 -0700 Subject: [PATCH 4/6] fix dsr1, remove 70b --- .github/workflows/70b-tmpl.yml | 30 +++++++------- .github/workflows/dsr1-tmpl.yml | 2 +- benchmarks/dsr1_b200_docker.sh | 72 +++++++-------------------------- benchmarks/dsr1_b200_slurm.sh | 43 +++++++++++++------- 4 files changed, 59 insertions(+), 88 deletions(-) diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml index 83717ede9..73ad841cf 100644 --- a/.github/workflows/70b-tmpl.yml +++ b/.github/workflows/70b-tmpl.yml @@ -62,21 +62,21 @@ jobs: # tp-list: '[1, 2, 4, 8]' # timeout: ${{ inputs.timeout }} - bmk-b200: - needs: find-latest-image - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: b200 - image: 'kedarpotdar147/vllm:05' - model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - tp-list: '[2]' - timeout: ${{ inputs.timeout }} + # bmk-b200: + # needs: find-latest-image + # uses: 
./.github/workflows/benchmark-tmpl.yml + # secrets: inherit + # with: + # exp-name: ${{ inputs.exp-name }} + # isl: ${{ inputs.isl }} + # osl: ${{ inputs.osl }} + # max-model-len: ${{ inputs.max-model-len }} + # random-range-ratio: ${{ inputs.random-range-ratio }} + # runner: b200 + # image: 'kedarpotdar147/vllm:05' + # model: 'nvidia/Llama-3.3-70B-Instruct-FP8' + # tp-list: '[2]' + # timeout: ${{ inputs.timeout }} # bmk-mi300x: # needs: find-latest-image diff --git a/.github/workflows/dsr1-tmpl.yml b/.github/workflows/dsr1-tmpl.yml index 811c2c422..14872b044 100644 --- a/.github/workflows/dsr1-tmpl.yml +++ b/.github/workflows/dsr1-tmpl.yml @@ -57,7 +57,7 @@ jobs: max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} runner: b200 - image: 'lmsysorg/sglang:v0.4.10.post1-cu128-b200' + image: 'lmsysorg/sglang:v0.5.0rc1-cu128-b200' model: 'deepseek-ai/DeepSeek-R1-0528' tp-list: '[8]' timeout: ${{ inputs.timeout }} diff --git a/benchmarks/dsr1_b200_docker.sh b/benchmarks/dsr1_b200_docker.sh index df65471f6..68f83a169 100644 --- a/benchmarks/dsr1_b200_docker.sh +++ b/benchmarks/dsr1_b200_docker.sh @@ -1,62 +1,20 @@ #!/usr/bin/env bash -while [ -n "$(docker ps -aq)" ]; do - docker rm -f $(docker ps -aq) - docker network prune -f - sleep 5 -done - -network_name="bmk-net" -server_name="bmk-server" -client_name="bmk-client" -port=8888 - -docker network create $network_name +# ========= Required Env Vars ========= +# HF_TOKEN +# HF_HUB_CACHE +# MODEL +# PORT +# TP +# CONC +# MAX_MODEL_LEN + +export SGL_ENABLE_JIT_DEEPGEMM=0 +export SGLANG_ENABLE_FLASHINFER_GEMM=1 set -x -docker run --rm -d --network $network_name --name $server_name \ ---runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ --v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --e HF_TOKEN=$HF_TOKEN -e HF_HUB_CACHE=$HF_HUB_CACHE -e SGL_ENABLE_JIT_DEEPGEMM=0 \ ---entrypoint=python3 \ -$IMAGE \ --m sglang.launch_server 
--model-path $MODEL --host 0.0.0.0 --port $port --trust-remote-code \ +python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT --trust-remote-code \ --tensor-parallel-size=$TP --data-parallel-size=1 \ ---cuda-graph-max-bs 256 --max-running-requests 512 --mem-fraction-static 0.89 \ ---chunked-prefill-size 32768 --max-prefill-tokens 32768 \ ---disable-radix-cache --attention-backend trtllm_mla --disable-shared-experts-fusion --enable-flashinfer-trtllm-moe - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then - docker stop $server_name - exit 1 - fi - if [[ "$line" == *"The server is fired up and ready to roll!"* ]]; then - break - fi -done < <(docker logs -f --tail=0 $server_name 2>&1) - -git clone https://github.com/kimbochen/bench_serving.git - -set -x -docker run --rm --network $network_name --name $client_name \ --v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ -e HF_TOKEN=$HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ ---entrypoint=python3 \ -$IMAGE \ -bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm --base-url http://$server_name:$port \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) \ ---max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics "ttft,tpot,itl,e2el" \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json - -while [ -n "$(docker ps -aq)" ]; do - docker stop $server_name - docker network rm $network_name - sleep 5 -done \ No newline at end of file +--kv-cache-dtype=fp8_e4m3 --mem-fraction-static=0.82 \ +--max-prefill-tokens=32768 --chunked-prefill-size=32768 --cuda-graph-max-bs=128 --max-running-requests=128 \ +--disable-radix-cache --enable-flashinfer-trtllm-moe --attention-backend=trtllm_mla --stream-interval=1 diff --git a/benchmarks/dsr1_b200_slurm.sh b/benchmarks/dsr1_b200_slurm.sh index 
9a1ba6271..28e9e2a32 100644 --- a/benchmarks/dsr1_b200_slurm.sh +++ b/benchmarks/dsr1_b200_slurm.sh @@ -7,27 +7,40 @@ SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) set -x PORT=$(( 8888 + $PORT_OFFSET )) -export SGL_ENABLE_JIT_DEEPGEMM=0 +export SGL_ENABLE_JIT_DEEPGEMM=false +export SGLANG_ENABLE_FLASHINFER_GEMM=true python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \ --tensor-parallel-size=$TP --data-parallel-size=1 \ ---cuda-graph-max-bs 256 --max-running-requests 512 --mem-fraction-static 0.89 \ +--cuda-graph-max-bs 128 --max-running-requests 128 --mem-fraction-static 0.82 --kv-cache-dtype fp8_e4m3 \ --chunked-prefill-size 32768 --max-prefill-tokens 32768 \ ---disable-radix-cache --attention-backend trtllm_mla --disable-shared-experts-fusion --enable-flashinfer-trtllm-moe \ +--disable-radix-cache --attention-backend trtllm_mla --enable-flashinfer-trtllm-moe --stream-interval 1 \ > $SERVER_LOG 2>&1 & set +x +IGNORE_PAT="Ignore import error when loading sglang.srt.models.glm4v_moe: No module named 'transformers.models.glm4v_moe'" + while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then - sleep 5 - tail -n100 $SERVER_LOG - echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" - exit 1 - fi - if [[ "$line" == *"The server is fired up and ready to roll"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") + printf '%s\n' "$line" + + # Skip the known benign "Ignore import error ..." 
line + if [[ "$line" == *"$IGNORE_PAT"* ]]; then + continue + fi + + # Keep your original "error" trap for everything else + if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then + sleep 5 + tail -n100 "$SERVER_LOG" + echo "JOB ${SLURM_JOB_ID:-NA} ran on NODE ${SLURMD_NODENAME:-unknown}" + exit 1 + fi + + # Break when server is ready + if [[ "$line" == *"The server is fired up and ready to roll"* ]]; then + break + fi +# Start tail from the beginning so we don't miss early lines +done < <(tail -n +1 -F "$SERVER_LOG") set -x git clone https://github.com/kimbochen/bench_serving.git @@ -40,4 +53,4 @@ python3 bench_serving/benchmark_serving.py \ --request-rate inf --ignore-eos \ --save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ --result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json \ No newline at end of file +--result-filename $RESULT_FILENAME.json From 2e21fe9f309a867bd8315ce769f88ead4711feca Mon Sep 17 00:00:00 2001 From: Kedar Potdar Date: Wed, 3 Sep 2025 15:23:24 -0700 Subject: [PATCH 5/6] readd 70b --- .github/workflows/70b-tmpl.yml | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml index 73ad841cf..3a2726bbe 100644 --- a/.github/workflows/70b-tmpl.yml +++ b/.github/workflows/70b-tmpl.yml @@ -62,21 +62,21 @@ jobs: # tp-list: '[1, 2, 4, 8]' # timeout: ${{ inputs.timeout }} - # bmk-b200: - # needs: find-latest-image - # uses: ./.github/workflows/benchmark-tmpl.yml - # secrets: inherit - # with: - # exp-name: ${{ inputs.exp-name }} - # isl: ${{ inputs.isl }} - # osl: ${{ inputs.osl }} - # max-model-len: ${{ inputs.max-model-len }} - # random-range-ratio: ${{ inputs.random-range-ratio }} - # runner: b200 - # image: 'kedarpotdar147/vllm:05' - # model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - # tp-list: '[2]' - # timeout: ${{ inputs.timeout }} + bmk-b200: + needs: find-latest-image + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + 
with: + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + runner: b200 + image: 'kedarpotdar147/vllm:05' + model: 'nvidia/Llama-3.3-70B-Instruct-FP8' + tp-list: '[8]' + timeout: ${{ inputs.timeout }} # bmk-mi300x: # needs: find-latest-image From 594bc88792e3bdb13110be465d41cb1c9e73bf20 Mon Sep 17 00:00:00 2001 From: Kedar Potdar Date: Wed, 3 Sep 2025 16:15:44 -0700 Subject: [PATCH 6/6] re-add other tests --- .github/workflows/70b-tmpl.yml | 154 ++++++++++++++++---------------- .github/workflows/dsr1-tmpl.yml | 122 ++++++++++++------------- 2 files changed, 138 insertions(+), 138 deletions(-) diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml index 3a2726bbe..921909434 100644 --- a/.github/workflows/70b-tmpl.yml +++ b/.github/workflows/70b-tmpl.yml @@ -30,37 +30,37 @@ jobs: - name: Find the latest Docker image run: echo "Hardcoding image tags for now." 
- # bmk-h100: - # needs: find-latest-image - # uses: ./.github/workflows/benchmark-tmpl.yml - # secrets: inherit - # with: - # exp-name: ${{ inputs.exp-name }} - # isl: ${{ inputs.isl }} - # osl: ${{ inputs.osl }} - # max-model-len: ${{ inputs.max-model-len }} - # random-range-ratio: ${{ inputs.random-range-ratio }} - # runner: h100 - # image: 'kedarpotdar147/vllm0.1:latest' - # model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - # tp-list: '[2, 4, 8]' - # timeout: ${{ inputs.timeout }} + bmk-h100: + needs: find-latest-image + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + runner: h100 + image: 'kedarpotdar147/vllm0.1:latest' + model: 'nvidia/Llama-3.3-70B-Instruct-FP8' + tp-list: '[2, 4, 8]' + timeout: ${{ inputs.timeout }} - # bmk-h200: - # needs: find-latest-image - # uses: ./.github/workflows/benchmark-tmpl.yml - # secrets: inherit - # with: - # exp-name: ${{ inputs.exp-name }} - # isl: ${{ inputs.isl }} - # osl: ${{ inputs.osl }} - # max-model-len: ${{ inputs.max-model-len }} - # random-range-ratio: ${{ inputs.random-range-ratio }} - # runner: h200 - # image: 'kedarpotdar147/vllm0.1:latest' - # model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - # tp-list: '[1, 2, 4, 8]' - # timeout: ${{ inputs.timeout }} + bmk-h200: + needs: find-latest-image + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + runner: h200 + image: 'kedarpotdar147/vllm0.1:latest' + model: 'nvidia/Llama-3.3-70B-Instruct-FP8' + tp-list: '[1, 2, 4, 8]' + timeout: ${{ inputs.timeout }} bmk-b200: needs: find-latest-image @@ -75,59 +75,59 @@ jobs: runner: b200 image: 'kedarpotdar147/vllm:05' model: 
'nvidia/Llama-3.3-70B-Instruct-FP8' - tp-list: '[8]' + tp-list: '[1, 2]' timeout: ${{ inputs.timeout }} - # bmk-mi300x: - # needs: find-latest-image - # uses: ./.github/workflows/benchmark-tmpl.yml - # secrets: inherit - # with: - # exp-name: ${{ inputs.exp-name }} - # isl: ${{ inputs.isl }} - # osl: ${{ inputs.osl }} - # max-model-len: ${{ inputs.max-model-len }} - # random-range-ratio: ${{ inputs.random-range-ratio }} - # runner: mi300x - # image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718' - # model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' - # tp-list: '[1, 2, 4, 8]' - # timeout: ${{ inputs.timeout }} + bmk-mi300x: + needs: find-latest-image + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + runner: mi300x + image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718' + model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' + tp-list: '[1, 2, 4, 8]' + timeout: ${{ inputs.timeout }} - # bmk-mi325x: - # needs: find-latest-image - # uses: ./.github/workflows/benchmark-tmpl.yml - # secrets: inherit - # with: - # exp-name: ${{ inputs.exp-name }} - # isl: ${{ inputs.isl }} - # osl: ${{ inputs.osl }} - # max-model-len: ${{ inputs.max-model-len }} - # random-range-ratio: ${{ inputs.random-range-ratio }} - # runner: mi325x - # image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718' - # model:
'amd/Llama-3.3-70B-Instruct-FP8-KV' + tp-list: '[1, 2, 4, 8]' + timeout: ${{ inputs.timeout }} - # bmk-mi355x: - # needs: find-latest-image - # uses: ./.github/workflows/benchmark-tmpl.yml - # secrets: inherit - # with: - # exp-name: ${{ inputs.exp-name }} - # isl: ${{ inputs.isl }} - # osl: ${{ inputs.osl }} - # max-model-len: ${{ inputs.max-model-len }} - # random-range-ratio: ${{ inputs.random-range-ratio }} - # runner: mi355x - # image: 'rocm/7.0-preview:rocm7.0_preview_ubuntu_22.04_vllm_0.9.1_mi35x_alpha' - # model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' - # tp-list: '[1, 2]' - # timeout: ${{ inputs.timeout }} + bmk-mi355x: + needs: find-latest-image + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + runner: mi355x + image: 'rocm/7.0-preview:rocm7.0_preview_ubuntu_22.04_vllm_0.9.1_mi35x_alpha' + model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' + tp-list: '[1, 2]' + timeout: ${{ inputs.timeout }} collect-results: - needs: [bmk-b200 ] + needs: [bmk-h100, bmk-h200, bmk-b200, bmk-mi300x, bmk-mi325x, bmk-mi355x] if: ${{ always() && !cancelled() }} uses: ./.github/workflows/collect-results.yml secrets: inherit diff --git a/.github/workflows/dsr1-tmpl.yml b/.github/workflows/dsr1-tmpl.yml index 14872b044..59e81c38b 100644 --- a/.github/workflows/dsr1-tmpl.yml +++ b/.github/workflows/dsr1-tmpl.yml @@ -30,21 +30,21 @@ jobs: - name: Find the latest Docker image run: echo "Hardcoding image tags for now."
- # bmk-h200: - # needs: find-latest-image - # uses: ./.github/workflows/benchmark-tmpl.yml - # secrets: inherit - # with: - # exp-name: ${{ inputs.exp-name }} - # isl: ${{ inputs.isl }} - # osl: ${{ inputs.osl }} - # max-model-len: ${{ inputs.max-model-len }} - # random-range-ratio: ${{ inputs.random-range-ratio }} - # runner: h200 - # image: 'lmsysorg/sglang:v0.4.9.post1-cu126' - # model: 'deepseek-ai/DeepSeek-R1-0528' - # tp-list: '[8]' - # timeout: ${{ inputs.timeout }} + bmk-h200: + needs: find-latest-image + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + runner: h200 + image: 'lmsysorg/sglang:v0.4.9.post1-cu126' + model: 'deepseek-ai/DeepSeek-R1-0528' + tp-list: '[8]' + timeout: ${{ inputs.timeout }} bmk-b200: needs: find-latest-image @@ -62,56 +62,56 @@ jobs: tp-list: '[8]' timeout: ${{ inputs.timeout }} - # bmk-mi300x: - # needs: find-latest-image - # uses: ./.github/workflows/benchmark-tmpl.yml - # secrets: inherit - # with: - # exp-name: ${{ inputs.exp-name }} - # isl: ${{ inputs.isl }} - # osl: ${{ inputs.osl }} - # max-model-len: ${{ inputs.max-model-len }} - # random-range-ratio: ${{ inputs.random-range-ratio }} - # runner: mi300x - # image: 'lmsysorg/sglang:v0.4.9.post2-rocm630-mi30x' - # model: 'deepseek-ai/DeepSeek-R1-0528' - # tp-list: '[8]' - # timeout: ${{ inputs.timeout }} + bmk-mi300x: + needs: find-latest-image + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + runner: mi300x + image: 'lmsysorg/sglang:v0.4.9.post2-rocm630-mi30x' + model: 'deepseek-ai/DeepSeek-R1-0528' + tp-list: '[8]' + timeout: ${{ inputs.timeout }} - # 
bmk-mi325x: - # needs: find-latest-image - # uses: ./.github/workflows/benchmark-tmpl.yml - # secrets: inherit - # with: - # exp-name: ${{ inputs.exp-name }} - # isl: ${{ inputs.isl }} - # osl: ${{ inputs.osl }} - # max-model-len: ${{ inputs.max-model-len }} - # random-range-ratio: ${{ inputs.random-range-ratio }} - # runner: mi325x - # image: 'lmsysorg/sglang:v0.4.9.post2-rocm630-mi30x' - # model: 'deepseek-ai/DeepSeek-R1-0528' - # tp-list: '[8]' - # timeout: ${{ inputs.timeout }} + bmk-mi325x: + needs: find-latest-image + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + runner: mi325x + image: 'lmsysorg/sglang:v0.4.9.post2-rocm630-mi30x' + model: 'deepseek-ai/DeepSeek-R1-0528' + tp-list: '[8]' + timeout: ${{ inputs.timeout }} - # bmk-mi355x: - # needs: find-latest-image - # uses: ./.github/workflows/benchmark-tmpl.yml - # secrets: inherit - # with: - # exp-name: ${{ inputs.exp-name }} - # isl: ${{ inputs.isl }} - # osl: ${{ inputs.osl }} - # max-model-len: ${{ inputs.max-model-len }} - # random-range-ratio: ${{ inputs.random-range-ratio }} - # runner: mi355x - # image: 'lmsysorg/sglang:v0.5.1.post2-rocm700-mi35x' - # model: 'deepseek-ai/DeepSeek-R1-0528' - # tp-list: '[8]' - # timeout: ${{ inputs.timeout }} + bmk-mi355x: + needs: find-latest-image + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + runner: mi355x + image: 'lmsysorg/sglang:v0.5.1.post2-rocm700-mi35x' + model: 'deepseek-ai/DeepSeek-R1-0528' + tp-list: '[8]' + timeout: ${{ inputs.timeout }} collect-results: - needs: [ bmk-b200,] + needs: [ bmk-h200, bmk-b200, bmk-mi300x, bmk-mi325x, 
bmk-mi355x] if: ${{ always() && !cancelled() }} uses: ./.github/workflows/collect-results.yml secrets: inherit