diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml index 23ad88551..e889a364b 100644 --- a/.github/workflows/70b-tmpl.yml +++ b/.github/workflows/70b-tmpl.yml @@ -30,37 +30,37 @@ jobs: - name: Find the latest Docker image run: echo "Hardcoding image tags for now." - bmk-h100: - needs: find-latest-image - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: h100 - image: 'kedarpotdar147/vllm0.1:latest' - model: 'nvidia/Llama-3.1-70B-Instruct-FP8' - tp-list: '[2, 4, 8]' - timeout: ${{ inputs.timeout }} + # bmk-h100: + # needs: find-latest-image + # uses: ./.github/workflows/benchmark-tmpl.yml + # secrets: inherit + # with: + # exp-name: ${{ inputs.exp-name }} + # isl: ${{ inputs.isl }} + # osl: ${{ inputs.osl }} + # max-model-len: ${{ inputs.max-model-len }} + # random-range-ratio: ${{ inputs.random-range-ratio }} + # runner: h100 + # image: 'kedarpotdar147/vllm0.1:latest' + # model: 'nvidia/Llama-3.1-70B-Instruct-FP8' + # tp-list: '[2]' + # timeout: ${{ inputs.timeout }} - bmk-h200: - needs: find-latest-image - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: h200 - image: 'kedarpotdar147/vllm0.1:latest' - model: 'nvidia/Llama-3.1-70B-Instruct-FP8' - tp-list: '[1, 2, 4, 8]' - timeout: ${{ inputs.timeout }} + # bmk-h200: + # needs: find-latest-image + # uses: ./.github/workflows/benchmark-tmpl.yml + # secrets: inherit + # with: + # exp-name: ${{ inputs.exp-name }} + # isl: ${{ inputs.isl }} + # osl: ${{ inputs.osl }} + # max-model-len: ${{ inputs.max-model-len }} + # random-range-ratio: ${{ inputs.random-range-ratio }} + # runner: h200 + # image: 'kedarpotdar147/vllm0.1:latest' + # model: 'nvidia/Llama-3.1-70B-Instruct-FP8' + # tp-list: '[2]' + # timeout: ${{ inputs.timeout }} bmk-b200: needs: find-latest-image @@ -75,43 +75,96 @@ jobs: runner: b200 image: 'kedarpotdar147/vllm0.1:latest' model: 'nvidia/Llama-3.1-70B-Instruct-FP8' - tp-list: '[1, 2, 4, 8]' + tp-list: '[2]' timeout: ${{ inputs.timeout }} - bmk-mi300x: + # bmk-mi300x: + # needs: find-latest-image + # uses: ./.github/workflows/benchmark-tmpl.yml + # secrets: inherit + # with: + # exp-name: ${{ inputs.exp-name }} + # isl: ${{ inputs.isl }} + # osl: ${{ inputs.osl }} + # max-model-len: ${{ inputs.max-model-len }} + # random-range-ratio: ${{ inputs.random-range-ratio }} + # runner: mi300x + # image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718' + # model: 'amd/Llama-3.1-70B-Instruct-FP8-KV' + # tp-list: '[1, 2, 4, 8]' + # timeout: ${{ inputs.timeout }} + + # bmk-mi325x: + # needs: find-latest-image + # uses: ./.github/workflows/benchmark-tmpl.yml + # secrets: inherit + # with: + # exp-name: ${{ inputs.exp-name }} + # isl: ${{ inputs.isl }} + # osl: ${{ inputs.osl }} + # max-model-len: ${{ inputs.max-model-len }} + # random-range-ratio: ${{ inputs.random-range-ratio }} + # runner: mi325x + # image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718' + # model: 'amd/Llama-3.1-70B-Instruct-FP8-KV' + # tp-list: '[1, 2, 4, 8]' + # timeout: ${{ inputs.timeout }} + + # TRT-LLM jobs + # bmk-b200-trt: + # needs: find-latest-image + # uses: ./.github/workflows/benchmark-tmpl.yml + # secrets: inherit + # with: + # exp-name: 70b-trt + # isl: ${{ inputs.isl }} + # osl: ${{ inputs.osl }} + # max-model-len: ${{ inputs.max-model-len }} + # random-range-ratio: ${{ inputs.random-range-ratio }} + # runner: b200 + # image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc1' + # model: 'nvidia/Llama-3.3-70B-Instruct-FP8' + # tp-list: '[2]' + # precision: 'fp8' + # timeout: ${{ inputs.timeout }} + + bmk-h200-trt: needs: find-latest-image uses: ./.github/workflows/benchmark-tmpl.yml secrets: inherit with: - exp-name: ${{ inputs.exp-name }} + exp-name: 70b-trt isl: ${{ inputs.isl }} osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - runner: mi300x - image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718' - model: 'amd/Llama-3.1-70B-Instruct-FP8-KV' - tp-list: '[1, 2, 4, 8]' + runner: h200 + image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc1' + model: 'nvidia/Llama-3.3-70B-Instruct-FP8' + tp-list: '[2]' + precision: 'fp8' timeout: ${{ inputs.timeout }} - bmk-mi325x: + bmk-b200-trt-fp4: needs: find-latest-image uses: ./.github/workflows/benchmark-tmpl.yml secrets: inherit with: - exp-name: ${{ inputs.exp-name }} + exp-name: 70b-trt isl: ${{ inputs.isl }} osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - runner: mi325x - image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718' - model: 'amd/Llama-3.1-70B-Instruct-FP8-KV' - tp-list: '[1, 2, 4, 8]' + runner: b200 + image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc1' + model: 'nvidia/Llama-3.3-70B-Instruct-FP4' + tp-list: '[2]' + precision: 'fp4' timeout: ${{ inputs.timeout }} + collect-results: - needs: [bmk-h100, bmk-h200, bmk-b200, bmk-mi300x, bmk-mi325x] + needs: [bmk-b200, bmk-h200-trt, bmk-b200-trt-fp4] if: ${{ always() && !cancelled() }} uses: ./.github/workflows/collect-results.yml secrets: inherit diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 7e4e0b708..98f2543b4 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -29,6 +29,10 @@ on: tp-list: required: true type: string + precision: + required: false + type: string + default: 'fp8' timeout: required: true type: number @@ -43,6 +47,8 @@ env: MAX_MODEL_LEN: ${{ inputs.max-model-len }} RANDOM_RANGE_RATIO: ${{ inputs.random-range-ratio }} IMAGE: ${{ inputs.image }} + RUNNER_LABEL: ${{ inputs.runner }} + PRECISION: ${{ inputs.precision }} jobs: benchmark: @@ -53,7 +59,7 @@ jobs: fail-fast: false matrix: tp: ${{ fromJson(inputs.tp-list) }} - conc: [4, 8, 16, 32, 64] + conc: [4] name: '${{ inputs.runner }} (tp${{ matrix.tp }} , conc${{ matrix.conc }})' env: @@ -68,7 +74,7 @@ jobs: - name: Set result filename run: | - RESULT_FILENAME=${{ env.EXP_NAME }}_tp${{ env.TP }}_conc${{ env.CONC }}_${{ runner.name }} + RESULT_FILENAME=${{ env.EXP_NAME }}_tp${{ env.TP }}_conc${{ env.CONC }}_${{ inputs.runner }} echo "RESULT_FILENAME=${RESULT_FILENAME}" >> $GITHUB_ENV - name: Launch job script @@ -77,10 +83,22 @@ jobs: bash ./runners/launch_${RUNNER_NAME%%_*}.sh ${{ inputs.exp-name }} - name: Process result - run: python3 utils/process_result.py ${{ inputs.runner }} ${{ env.TP }} ${{ env.RESULT_FILENAME }} + run: | + RESULT_FILENAME=${{ env.EXP_NAME }}_tp${{ env.TP }}_conc${{ env.CONC }}_${{ inputs.runner }} + # Determine framework based on image + if [[ "${{ inputs.image }}" == *"tensorrt-llm"* ]]; then + FRAMEWORK="TRT-LLM" + elif [[ "${{ inputs.image }}" == *"vllm"* ]]; then + FRAMEWORK="vLLM" + elif [[ "${{ inputs.image }}" == *"sglang"* ]]; then + FRAMEWORK="SGLang" + else + FRAMEWORK="${{ inputs.runner }}" + fi + python3 utils/process_result.py $FRAMEWORK ${{ env.TP }} $RESULT_FILENAME ${{ env.PRECISION }} - name: Upload result uses: actions/upload-artifact@v4 with: - name: ${{ env.RESULT_FILENAME }} - path: agg_${{ env.RESULT_FILENAME }}.json + name: ${{ env.EXP_NAME }}_tp${{ env.TP }}_conc${{ env.CONC }}_${{ env.PRECISION }}_${{ runner.name }} + path: agg_${{ env.EXP_NAME }}_tp${{ env.TP }}_conc${{ env.CONC }}_${{ inputs.runner }}.json diff --git a/.github/workflows/cluster-cleanup.yml b/.github/workflows/cluster-cleanup.yml index e0f30ae17..373794a69 100644 --- a/.github/workflows/cluster-cleanup.yml +++ b/.github/workflows/cluster-cleanup.yml @@ -24,7 +24,7 @@ jobs: - 'h200-nv_2' - 'h200-nv_3' - 'b200-nv_0' - - 'b200-nv_1' + - 'b200-nv_1' - 'mi325x-tw_0' - 'mi325x-tw_1' - 'mi325x-tw_2' @@ -47,7 +47,7 @@ jobs: runner: - 'h100-cr_0' - 'h100-cr_1' - - 'b200-tg_0' + # - 'b200-tg_0' - 'mi300x-cr_0' - 'mi300x-amd_0' - 'mi300x-amd_1' diff --git a/.github/workflows/collect-results.yml b/.github/workflows/collect-results.yml index 8924facb0..c98715e4d 100644 --- a/.github/workflows/collect-results.yml +++ b/.github/workflows/collect-results.yml @@ -22,7 +22,13 @@ jobs: uses: actions/download-artifact@v4 with: path: results/ - pattern: ${{ inputs.exp-name }}_* + pattern: ${{ inputs.exp-name }}* + + - name: Download TRT artifacts + uses: actions/download-artifact@v4 + with: + path: results/ + pattern: 70b-trt* - name: Print summary run: python3 utils/summarize.py results/ ${{ inputs.exp-name }} >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/workflow-scheduler.yml b/.github/workflows/workflow-scheduler.yml index ce03740fc..de673c2a5 100644 --- a/.github/workflows/workflow-scheduler.yml +++ b/.github/workflows/workflow-scheduler.yml @@ -23,59 +23,61 @@ jobs: osl: 1024 max-model-len: 2048 random-range-ratio: 0.8 - - dsr1-1k1k: - needs: cleanup - uses: ./.github/workflows/dsr1-tmpl.yml - secrets: inherit - with: - exp-name: 'dsr1_1k1k' - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - _70b-8k1k: - needs: cleanup - uses: ./.github/workflows/70b-tmpl.yml - secrets: inherit - with: - exp-name: '70b_8k1k' - isl: 8192 - osl: 1024 - max-model-len: 9216 - random-range-ratio: 0.8 + + # dsr1-1k1k: + # needs: cleanup + # uses: ./.github/workflows/dsr1-tmpl.yml + # secrets: inherit + # with: + # exp-name: 'dsr1_1k1k' + # isl: 1024 + # osl: 1024 + # max-model-len: 2048 + # random-range-ratio: 0.8 + + # _70b-8k1k: + # needs: cleanup + # uses: ./.github/workflows/70b-tmpl.yml + # secrets: inherit + # with: + # exp-name: '70b_8k1k' + # isl: 8192 + # osl: 1024 + # max-model-len: 9216 + # random-range-ratio: 0.8 + - dsr1-8k1k: - needs: cleanup - uses: ./.github/workflows/dsr1-tmpl.yml - secrets: inherit - with: - exp-name: 'dsr1_8k1k' - isl: 8192 - osl: 1024 - max-model-len: 9216 - random-range-ratio: 0.8 + # dsr1-8k1k: + # needs: cleanup + # uses: ./.github/workflows/dsr1-tmpl.yml + # secrets: inherit + # with: + # exp-name: 'dsr1_8k1k' + # isl: 8192 + # osl: 1024 + # max-model-len: 9216 + # random-range-ratio: 0.8 - _70b-1k8k: - needs: cleanup - uses: ./.github/workflows/70b-tmpl.yml - secrets: inherit - with: - exp-name: '70b_1k8k' - isl: 1024 - osl: 8192 - max-model-len: 9216 - random-range-ratio: 0.8 - timeout: 240 + # _70b-1k8k: + # needs: cleanup + # uses: ./.github/workflows/70b-tmpl.yml + # secrets: inherit + # with: + # exp-name: '70b_1k8k' + # isl: 1024 + # osl: 8192 + # max-model-len: 9216 + # random-range-ratio: 0.8 + # timeout: 240 - dsr1-1k8k: - needs: cleanup - uses: ./.github/workflows/dsr1-tmpl.yml - secrets: inherit - with: - exp-name: 'dsr1_1k8k' - isl: 1024 - osl: 8192 - max-model-len: 9216 - random-range-ratio: 0.8 + # dsr1-1k8k: + # needs: cleanup + # uses: ./.github/workflows/dsr1-tmpl.yml + # secrets: inherit + # with: + # exp-name: 'dsr1_1k8k' + # isl: 1024 + # osl: 8192 + # max-model-len: 9216 + # random-range-ratio: 0.8 diff --git a/benchmarks/70b-trt_b200_slurm.sh b/benchmarks/70b-trt_b200_slurm.sh new file mode 100644 index 000000000..5f91bb2e2 --- /dev/null +++ b/benchmarks/70b-trt_b200_slurm.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +set -x +hf download $MODEL +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +PORT=$(( 8888 + $PORT_OFFSET )) + +# Create llama-config.yml inline +cat > llama-config.yml << 'EOF' +enable_attention_dp: false +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 4 +EOF + +mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens $MAX_MODEL_LEN --num_postprocess_workers 2 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then + sleep 5 + tail -n100 $SERVER_LOG + echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" + exit 1 + fi + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend openai \ +--base-url http://0.0.0.0:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b-trt_h200_slurm.sh b/benchmarks/70b-trt_h200_slurm.sh new file mode 100644 index 000000000..5f91bb2e2 --- /dev/null +++ b/benchmarks/70b-trt_h200_slurm.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +set -x +hf download $MODEL +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +PORT=$(( 8888 + $PORT_OFFSET )) + +# Create llama-config.yml inline +cat > llama-config.yml << 'EOF' +enable_attention_dp: false +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 4 +EOF + +mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens $MAX_MODEL_LEN --num_postprocess_workers 2 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then + sleep 5 + tail -n100 $SERVER_LOG + echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" + exit 1 + fi + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend openai \ +--base-url http://0.0.0.0:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_b200_docker.sh b/benchmarks/70b_b200_docker.sh index 27e20c770..da933f4cf 100644 --- a/benchmarks/70b_b200_docker.sh +++ b/benchmarks/70b_b200_docker.sh @@ -29,6 +29,9 @@ port=8888 docker network create $network_name set -x + +pip uninstall -y nvidia-nccl-cu12 +pip install nvidia-nccl-cu12==2.26.2.post1 docker run --rm -d --network $network_name --name $server_name \ --runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ -v $HF_HOME_DIR/hf_hub_cache/:$HF_HUB_CACHE \ diff --git a/benchmarks/70b_b200_slurm.sh b/benchmarks/70b_b200_slurm.sh index fd444abab..9a0ac2558 100644 --- a/benchmarks/70b_b200_slurm.sh +++ b/benchmarks/70b_b200_slurm.sh @@ -21,6 +21,9 @@ hf download $MODEL SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) +pip uninstall -y nvidia-nccl-cu12 +pip install nvidia-nccl-cu12==2.26.2.post1 + export TORCH_CUDA_ARCH_LIST="10.0" vllm serve $MODEL --host 0.0.0.0 --port $PORT \ --trust-remote-code --quantization modelopt --kv-cache-dtype fp8 --gpu-memory-utilization 0.9 \ diff --git a/runners/launch_b200-nv.sh b/runners/launch_b200-nv.sh index 83f1ec801..21ec5c35e 100644 --- a/runners/launch_b200-nv.sh +++ b/runners/launch_b200-nv.sh @@ -5,10 +5,10 @@ export PORT_OFFSET=${USER: -1} MODEL_CODE="${1%%_*}" PARTITION="dgx-b200" -SQUASH_FILE="/raid/image_${MODEL_CODE}_b200.sqsh" +SQUASH_FILE="/raid/image_${MODEL_CODE}_${RUNNER_LABEL}-2.sqsh" salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell -JOB_ID=$(squeue -u $USER -h -o %A) +JOB_ID=$(squeue -u $USER -h -o %A | tail -1) set -x srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" @@ -18,6 +18,6 @@ srun --jobid=$JOB_ID \ --container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ -bash benchmarks/${MODEL_CODE}_b200_slurm.sh +bash benchmarks/${MODEL_CODE}_${RUNNER_LABEL}_slurm.sh scancel $JOB_ID diff --git a/runners/launch_h100-cw.sh b/runners/launch_h100-cw.sh index 570790e0b..f39c2f8b0 100644 --- a/runners/launch_h100-cw.sh +++ b/runners/launch_h100-cw.sh @@ -5,7 +5,7 @@ export HF_HUB_CACHE_MOUNT="/mnt/vast/hf_hub_cache/" export PORT_OFFSET=${USER: -1} PARTITION="h100" -SQUASH_FILE="/mnt/vast/squash/image_${MODEL_CODE}_h100.sqsh" +SQUASH_FILE="/mnt/vast/squash/image_${MODEL_CODE}_h100-2.sqsh" salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell JOB_ID=$(squeue -u $USER -h -o %A) diff --git a/runners/launch_h200-cw.sh b/runners/launch_h200-cw.sh index 3245cb379..1329fd4f7 100644 --- a/runners/launch_h200-cw.sh +++ b/runners/launch_h200-cw.sh @@ -5,10 +5,10 @@ export HF_HUB_CACHE_MOUNT="/mnt/vast/hf_hub_cache/" export PORT_OFFSET=${USER: -1} PARTITION="h200" -SQUASH_FILE="/mnt/vast/squash/image_${MODEL_CODE}_h200.sqsh" +SQUASH_FILE="/mnt/vast/squash/image_${MODEL_CODE}_h200-2.sqsh" salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell -JOB_ID=$(squeue -u $USER -h -o %A) +JOB_ID=$(squeue -u $USER -h -o %A | tail -1) set -x srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" diff --git a/runners/launch_h200-nb.sh b/runners/launch_h200-nb.sh index 028cf8033..7d4dbd2df 100644 --- a/runners/launch_h200-nb.sh +++ b/runners/launch_h200-nb.sh @@ -5,7 +5,7 @@ export HF_HUB_CACHE_MOUNT="/home/hf_hub_cache/" export PORT_OFFSET=${USER: -1} PARTITION="main" -SQUASH_FILE="/home/squash/image_${MODEL_CODE}_h200.sqsh" +SQUASH_FILE="/home/squash/image_${MODEL_CODE}_h200-2.sqsh" salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell JOB_ID=$(squeue -u $USER -h -o %A) diff --git a/runners/launch_h200-nv.sh b/runners/launch_h200-nv.sh index 4bedf9b71..b5b2d7df5 100644 --- a/runners/launch_h200-nv.sh +++ b/runners/launch_h200-nv.sh @@ -5,10 +5,10 @@ export HF_HUB_CACHE_MOUNT="/raid/hf_hub_cache/" export PORT_OFFSET=${USER: -1} PARTITION="dgx-h200" -SQUASH_FILE="/raid/image_${MODEL_CODE}_h200.sqsh" +SQUASH_FILE="/raid/image_${MODEL_CODE}_${RUNNER_LABEL}-2.sqsh" salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell -JOB_ID=$(squeue -u $USER -h -o %A) +JOB_ID=$(squeue -u $USER -h -o %A | tail -1) set -x srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" @@ -18,6 +18,6 @@ srun --jobid=$JOB_ID \ --container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ -bash benchmarks/${MODEL_CODE}_h200_slurm.sh +bash benchmarks/${MODEL_CODE}_${RUNNER_LABEL}_slurm.sh scancel $JOB_ID diff --git a/utils/plot_perf.py b/utils/plot_perf.py index 35eb46eb2..a7811ea0e 100644 --- a/utils/plot_perf.py +++ b/utils/plot_perf.py @@ -8,7 +8,7 @@ exp_name = sys.argv[2] hw_color = { 'h100': 'lightgreen', - 'h200': 'green', + 'h200': 'darkgreen', 'b200': 'black', 'mi300x': 'pink', 'mi325x': 'red', @@ -25,15 +25,25 @@ def plot_tput_vs_e2el(): fig, ax = plt.subplots() - for hw, color in hw_color.items(): - xs = [result['median_e2el'] for result in results if result['hw'] == hw] - ys = [result['tput_per_gpu'] for result in results if result['hw'] == hw] - if xs and ys: - ax.scatter(xs, ys, label=hw.upper(), color=color) + # Group by hardware, framework, and precision + for hw in set(result['hw'] for result in results): + for framework in set(result.get('framework', 'vLLM') for result in results if result['hw'] == hw): + for precision in set(result.get('precision', 'fp8') for result in results if result['hw'] == hw and result.get('framework', 'vLLM') == framework): + xs = [result.get('e2el', result.get('median_e2el', 0)) for result in results if result['hw'] == hw and result.get('framework', 'vLLM') == framework and result.get('precision', 'fp8') == precision] + ys = [result['tput_per_gpu'] for result in results if result['hw'] == hw and result.get('framework', 'vLLM') == framework and result.get('precision', 'fp8') == precision] + if xs and ys: + # Only add framework label for TRT-LLM, keep vLLM simple + if framework == 'TRT-LLM': + label = f"{hw.upper()}-TRT-{precision.upper()}" + else: + label = f"{hw.upper()}-{precision.upper()}" + color = hw_color.get(hw.lower(), 'blue') + ax.scatter(xs, ys, label=label, color=color, alpha=0.7) for result in results: - x, y = result['median_e2el'], result['tput_per_gpu'] - ax.annotate(str(result['tp']), (x, y), textcoords='offset points', xytext=(3, 3), ha='left', fontsize=8) + x = result.get('e2el', result.get('median_e2el', 0)) + y = result['tput_per_gpu'] + ax.annotate(f"{result['tp']}-{result.get('precision', 'fp8').upper()}", (x, y), textcoords='offset points', xytext=(3, 3), ha='left', fontsize=8) ax.set_xlabel('End-to-end Latency (s)') ax.set_ylabel('Throughput per GPU (tok/s)') @@ -47,15 +57,25 @@ def plot_tput_vs_e2el(): def plot_tput_vs_intvty(): fig, ax = plt.subplots() - for hw, color in hw_color.items(): - xs = [result['median_intvty'] for result in results if result['hw'] == hw] - ys = [result['tput_per_gpu'] for result in results if result['hw'] == hw] - if xs and ys: - ax.scatter(xs, ys, label=hw.upper(), color=color) + # Group by hardware, framework, and precision + for hw in set(result['hw'] for result in results): + for framework in set(result.get('framework', 'vLLM') for result in results if result['hw'] == hw): + for precision in set(result.get('precision', 'fp8') for result in results if result['hw'] == hw and result.get('framework', 'vLLM') == framework): + xs = [result.get('intvty', result.get('median_intvty', 0)) for result in results if result['hw'] == hw and result.get('framework', 'vLLM') == framework and result.get('precision', 'fp8') == precision] + ys = [result['tput_per_gpu'] for result in results if result['hw'] == hw and result.get('framework', 'vLLM') == framework and result.get('precision', 'fp8') == precision] + if xs and ys: + # Only add framework label for TRT-LLM, keep vLLM simple + if framework == 'TRT-LLM': + label = f"{hw.upper()}-TRT-{precision.upper()}" + else: + label = f"{hw.upper()}-{precision.upper()}" + color = hw_color.get(hw.lower(), 'blue') + ax.scatter(xs, ys, label=label, color=color, alpha=0.7) for result in results: - x, y = result['median_intvty'], result['tput_per_gpu'] - ax.annotate(str(result['tp']), (x, y), textcoords='offset points', xytext=(3, 3), ha='left', fontsize=8) + x = result.get('intvty', result.get('median_intvty', 0)) + y = result['tput_per_gpu'] + ax.annotate(f"{result['tp']}-{result.get('precision', 'fp8').upper()}", (x, y), textcoords='offset points', xytext=(3, 3), ha='left', fontsize=8) ax.set_xlabel('Interactivity (tok/s/user)') ax.set_ylabel('Throughput per GPU (tok/s)') diff --git a/utils/process_result.py b/utils/process_result.py index d0f0ef000..76f1b8541 100644 --- a/utils/process_result.py +++ b/utils/process_result.py @@ -3,15 +3,29 @@ from pathlib import Path -hw = sys.argv[1] +framework = sys.argv[1] # First arg is the framework (TRT-LLM, vLLM, SGLang, etc.) tp_size = int(sys.argv[2]) result_filename = sys.argv[3] +precision = sys.argv[4] if len(sys.argv) > 4 else 'fp8' # Fourth arg is precision, default to fp8 with open(f'{result_filename}.json') as f: bmk_result = json.load(f) +# Extract hardware from result filename or runner name +# Result filename format: {exp-name}_tp{tp}_conc{conc}_{runner} +# We need to extract the hardware type from the runner +result_parts = result_filename.split('_') +if len(result_parts) >= 4: + runner_part = result_parts[-1] # Last part is the runner + # Extract hardware type (e.g., 'b200' from 'b200-nv_0') + hw = runner_part.split('-')[0].upper() # Convert to uppercase for consistency +else: + hw = "UNKNOWN" + data = { - 'hw': hw, + 'hw': hw, # Hardware (B200, H200, etc.) + 'framework': framework, # Framework (TRT-LLM, vLLM, SGLang, etc.) + 'precision': precision, # Precision (fp8, fp4, etc.) 'tp': tp_size, 'conc': int(bmk_result['max_concurrency']), 'model': bmk_result['model_id'], diff --git a/utils/summarize.py b/utils/summarize.py index 20d9ae127..50ea6e07a 100644 --- a/utils/summarize.py +++ b/utils/summarize.py @@ -12,18 +12,42 @@ results.sort(key=lambda r: (r['hw'], r['tp'], r['conc'])) summary_header = f'''\ -| Hardware | TP | Conc | TTFT (ms) | TPOT (ms) | E2EL (s) | TPUT per GPU | -| :-: | :-: | :-: | :-: | :-: | :-: | :-: |\ +| Hardware | Framework | Precision | TP | Conc | TTFT (ms) | TPOT (ms) | E2EL (s) | TPUT per GPU | +| :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |\ ''' print(summary_header) for result in results: + # Extract framework - prefer explicit framework field, fallback to detection + framework = result.get('framework', 'vLLM') # default to vLLM if not specified + + # If no explicit framework field, try to detect from other fields + if framework == 'vLLM': + exp_name = result.get('exp_name', '') + runner = result.get('runner', '') + + # Check for TRT-LLM indicators + if ('trt' in exp_name.lower() or 'trt' in runner.lower() or + 'trt-llm' in exp_name.lower() or 'trt-llm' in runner.lower() or + 'tensorrt' in exp_name.lower() or 'tensorrt' in runner.lower()): + framework = 'TRT-LLM' + + # Get precision, default to 'fp8' if not present + precision = result.get('precision', 'fp8') + + # Get metrics with fallbacks for missing fields + ttft = result.get('ttft', result.get('median_ttft', 0)) + tpot = result.get('tpot', result.get('median_tpot', 0)) + e2el = result.get('e2el', result.get('median_e2el', 0)) + print( f"| {result['hw'].upper()} " + f"| {framework} " + f"| {precision.upper()} " f"| {result['tp']} " f"| {result['conc']} " - f"| {(result['median_ttft'] * 1000):.4f} " - f"| {(result['median_tpot'] * 1000):.4f} " - f"| {result['median_e2el']:.4f} " + f"| {(ttft * 1000):.4f} " + f"| {(tpot * 1000):.4f} " + f"| {e2el:.4f} " f"| {result['tput_per_gpu']:.4f} |" )