diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml index 23ad88551..ff70adcca 100644 --- a/.github/workflows/70b-tmpl.yml +++ b/.github/workflows/70b-tmpl.yml @@ -59,7 +59,7 @@ jobs: runner: h200 image: 'kedarpotdar147/vllm0.1:latest' model: 'nvidia/Llama-3.1-70B-Instruct-FP8' - tp-list: '[1, 2, 4, 8]' + tp-list: '[2]' timeout: ${{ inputs.timeout }} bmk-b200: @@ -75,7 +75,7 @@ jobs: runner: b200 image: 'kedarpotdar147/vllm0.1:latest' model: 'nvidia/Llama-3.1-70B-Instruct-FP8' - tp-list: '[1, 2, 4, 8]' + tp-list: '[2]' timeout: ${{ inputs.timeout }} bmk-mi300x: diff --git a/.github/workflows/70b-trt-tmpl.yml b/.github/workflows/70b-trt-tmpl.yml new file mode 100644 index 000000000..40cf27f38 --- /dev/null +++ b/.github/workflows/70b-trt-tmpl.yml @@ -0,0 +1,71 @@ +name: LLaMA 70B TRT-LLM Template + +on: + workflow_call: + inputs: + exp-name: + required: true + type: string + isl: + required: true + type: string + osl: + required: true + type: string + max-model-len: + required: true + type: string + random-range-ratio: + required: true + type: string + timeout: + required: false + type: number + default: 180 + +jobs: + find-latest-image: + runs-on: ubuntu-latest + steps: + - name: Find the latest Docker image + run: echo "Hardcoding image tags for now." + + bmk-b200-trt: + needs: find-latest-image + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + runner: b200 + image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc1' + model: 'nvidia/Llama-3.3-70B-Instruct-FP8' + tp-list: '[2]' + timeout: ${{ inputs.timeout }} + + bmk-h200-trt: + needs: find-latest-image + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + runner: h200 + image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc1' + model: 'nvidia/Llama-3.3-70B-Instruct-FP8' + tp-list: '[2]' + timeout: ${{ inputs.timeout }} + + collect-results: + needs: [bmk-b200-trt, bmk-h200-trt] + if: ${{ always() && !cancelled() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: ${{ inputs.exp-name }} diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 7e4e0b708..c1e89d80a 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -43,6 +43,7 @@ env: MAX_MODEL_LEN: ${{ inputs.max-model-len }} RANDOM_RANGE_RATIO: ${{ inputs.random-range-ratio }} IMAGE: ${{ inputs.image }} + RUNNER_LABEL: ${{ inputs.runner }} jobs: benchmark: @@ -53,7 +54,7 @@ jobs: fail-fast: false matrix: tp: ${{ fromJson(inputs.tp-list) }} - conc: [4, 8, 16, 32, 64] + conc: [4] name: '${{ inputs.runner }} (tp${{ matrix.tp }} , conc${{ matrix.conc }})' env: @@ -68,7 +69,7 @@ jobs: - name: Set result filename run: | - RESULT_FILENAME=${{ env.EXP_NAME }}_tp${{ env.TP }}_conc${{ env.CONC }}_${{ runner.name }} + RESULT_FILENAME=${{ env.EXP_NAME }}_tp${{ env.TP }}_conc${{ env.CONC }}_${{ inputs.runner }} echo "RESULT_FILENAME=${RESULT_FILENAME}" >> $GITHUB_ENV - name: Launch job script @@ -77,10 +78,22 @@ jobs: bash ./runners/launch_${RUNNER_NAME%%_*}.sh ${{ inputs.exp-name }} - name: Process result - run: python3 utils/process_result.py ${{ inputs.runner }} ${{ env.TP }} ${{ env.RESULT_FILENAME }} + run: | + RESULT_FILENAME=${{ env.EXP_NAME }}_tp${{ env.TP }}_conc${{ env.CONC }}_${{ inputs.runner }} + # Determine framework based on image + if [[ "${{ inputs.image }}" == *"tensorrt-llm"* ]]; then + FRAMEWORK="TRT-LLM" + elif [[ "${{ inputs.image }}" == *"vllm"* ]]; then + FRAMEWORK="vLLM" + elif [[ "${{ inputs.image }}" == *"sglang"* ]]; then + FRAMEWORK="SGLang" + else + FRAMEWORK="${{ inputs.runner }}" + fi + python3 utils/process_result.py $FRAMEWORK ${{ env.TP }} $RESULT_FILENAME - name: Upload result uses: actions/upload-artifact@v4 with: - name: ${{ env.RESULT_FILENAME }} - path: agg_${{ env.RESULT_FILENAME }}.json + name: ${{ env.EXP_NAME }}_tp${{ env.TP }}_conc${{ env.CONC }}_${{ runner.name }} + path: agg_${{ env.EXP_NAME }}_tp${{ env.TP }}_conc${{ env.CONC }}_${{ inputs.runner }}.json diff --git a/.github/workflows/cluster-cleanup.yml b/.github/workflows/cluster-cleanup.yml index e0f30ae17..a74311d9f 100644 --- a/.github/workflows/cluster-cleanup.yml +++ b/.github/workflows/cluster-cleanup.yml @@ -47,7 +47,7 @@ jobs: runner: - 'h100-cr_0' - 'h100-cr_1' - - 'b200-tg_0' + # - 'b200-tg_0' - 'mi300x-cr_0' - 'mi300x-amd_0' - 'mi300x-amd_1' diff --git a/.github/workflows/workflow-scheduler.yml b/.github/workflows/workflow-scheduler.yml index ce03740fc..c65335569 100644 --- a/.github/workflows/workflow-scheduler.yml +++ b/.github/workflows/workflow-scheduler.yml @@ -13,69 +13,103 @@ jobs: cleanup: uses: ./.github/workflows/cluster-cleanup.yml - _70b-1k1k: + # _70b-1k1k: + # needs: cleanup + # uses: ./.github/workflows/70b-tmpl.yml + # secrets: inherit + # with: + # exp-name: '70b_1k1k' + # isl: 1024 + # osl: 1024 + # max-model-len: 2048 + # random-range-ratio: 0.8 + + _70b-trt-1k1k: needs: cleanup - uses: ./.github/workflows/70b-tmpl.yml + uses: ./.github/workflows/70b-trt-tmpl.yml secrets: inherit with: - exp-name: '70b_1k1k' + exp-name: '70b-trt_1k1k' isl: 1024 osl: 1024 max-model-len: 2048 random-range-ratio: 0.8 - dsr1-1k1k: - needs: cleanup - uses: ./.github/workflows/dsr1-tmpl.yml - secrets: inherit - with: - exp-name: 'dsr1_1k1k' - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 + # dsr1-1k1k: + # needs: cleanup + # uses: ./.github/workflows/dsr1-tmpl.yml + # secrets: inherit + # with: + # exp-name: 'dsr1_1k1k' + # isl: 1024 + # osl: 1024 + # max-model-len: 2048 + # random-range-ratio: 0.8 - _70b-8k1k: - needs: cleanup - uses: ./.github/workflows/70b-tmpl.yml - secrets: inherit - with: - exp-name: '70b_8k1k' - isl: 8192 - osl: 1024 - max-model-len: 9216 - random-range-ratio: 0.8 + # _70b-8k1k: + # needs: cleanup + # uses: ./.github/workflows/70b-tmpl.yml + # secrets: inherit + # with: + # exp-name: '70b_8k1k' + # isl: 8192 + # osl: 1024 + # max-model-len: 9216 + # random-range-ratio: 0.8 + + # _70b-trt-8k1k: + # needs: cleanup + # uses: ./.github/workflows/70b-trt-tmpl.yml + # secrets: inherit + # with: + # exp-name: '70b-trt_8k1k' + # isl: 8192 + # osl: 1024 + # max-model-len: 9216 + # random-range-ratio: 0.8 - dsr1-8k1k: - needs: cleanup - uses: ./.github/workflows/dsr1-tmpl.yml - secrets: inherit - with: - exp-name: 'dsr1_8k1k' - isl: 8192 - osl: 1024 - max-model-len: 9216 - random-range-ratio: 0.8 + # dsr1-8k1k: + # needs: cleanup + # uses: ./.github/workflows/dsr1-tmpl.yml + # secrets: inherit + # with: + # exp-name: 'dsr1_8k1k' + # isl: 8192 + # osl: 1024 + # max-model-len: 9216 + # random-range-ratio: 0.8 - _70b-1k8k: - needs: cleanup - uses: ./.github/workflows/70b-tmpl.yml - secrets: inherit - with: - exp-name: '70b_1k8k' - isl: 1024 - osl: 8192 - max-model-len: 9216 - random-range-ratio: 0.8 - timeout: 240 + # _70b-1k8k: + # needs: cleanup + # uses: ./.github/workflows/70b-tmpl.yml + # secrets: inherit + # with: + # exp-name: '70b_1k8k' + # isl: 1024 + # osl: 8192 + # max-model-len: 9216 + # random-range-ratio: 0.8 + # timeout: 240 - dsr1-1k8k: - needs: cleanup - uses: ./.github/workflows/dsr1-tmpl.yml - secrets: inherit - with: - exp-name: 'dsr1_1k8k' - isl: 1024 - osl: 8192 - max-model-len: 9216 - random-range-ratio: 0.8 + # _70b-trt-1k8k: + # needs: cleanup + # uses: ./.github/workflows/70b-trt-tmpl.yml + # secrets: inherit + # with: + # exp-name: '70b-trt_1k8k' + # isl: 1024 + # osl: 8192 + # max-model-len: 9216 + # random-range-ratio: 0.8 + # timeout: 240 + + # dsr1-1k8k: + # needs: cleanup + # uses: ./.github/workflows/dsr1-tmpl.yml + # secrets: inherit + # with: + # exp-name: 'dsr1_1k8k' + # isl: 1024 + # osl: 8192 + # max-model-len: 9216 + # random-range-ratio: 0.8 diff --git a/benchmarks/70b-trt_b200_slurm.sh b/benchmarks/70b-trt_b200_slurm.sh new file mode 100644 index 000000000..5f91bb2e2 --- /dev/null +++ b/benchmarks/70b-trt_b200_slurm.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +set -x +hf download $MODEL +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +PORT=$(( 8888 + $PORT_OFFSET )) + +# Create llama-config.yml inline +cat > llama-config.yml << 'EOF' +enable_attention_dp: false +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 4 +EOF + +mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens $MAX_MODEL_LEN --num_postprocess_workers 2 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then + sleep 5 + tail -n100 $SERVER_LOG + echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" + exit 1 + fi + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend openai \ +--base-url http://0.0.0.0:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b-trt_h200_slurm.sh b/benchmarks/70b-trt_h200_slurm.sh new file mode 100644 index 000000000..5f91bb2e2 --- /dev/null +++ b/benchmarks/70b-trt_h200_slurm.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +set -x +hf download $MODEL +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +PORT=$(( 8888 + $PORT_OFFSET )) + +# Create llama-config.yml inline +cat > llama-config.yml << 'EOF' +enable_attention_dp: false +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 4 +EOF + +mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens $MAX_MODEL_LEN --num_postprocess_workers 2 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then + sleep 5 + tail -n100 $SERVER_LOG + echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" + exit 1 + fi + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend openai \ +--base-url http://0.0.0.0:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json diff --git a/runners/launch_b200-nv.sh b/runners/launch_b200-nv.sh index 83f1ec801..21ec5c35e 100644 --- a/runners/launch_b200-nv.sh +++ b/runners/launch_b200-nv.sh @@ -5,10 +5,10 @@ export PORT_OFFSET=${USER: -1} MODEL_CODE="${1%%_*}" PARTITION="dgx-b200" -SQUASH_FILE="/raid/image_${MODEL_CODE}_b200.sqsh" +SQUASH_FILE="/raid/image_${MODEL_CODE}_${RUNNER_LABEL}-2.sqsh" salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell -JOB_ID=$(squeue -u $USER -h -o %A) +JOB_ID=$(squeue -u $USER -h -o %A | tail -1) set -x srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" @@ -18,6 +18,6 @@ srun --jobid=$JOB_ID \ --container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ -bash benchmarks/${MODEL_CODE}_b200_slurm.sh +bash benchmarks/${MODEL_CODE}_${RUNNER_LABEL}_slurm.sh scancel $JOB_ID diff --git a/runners/launch_h100-cw.sh b/runners/launch_h100-cw.sh index 570790e0b..f39c2f8b0 100644 --- a/runners/launch_h100-cw.sh +++ b/runners/launch_h100-cw.sh @@ -5,7 +5,7 @@ export HF_HUB_CACHE_MOUNT="/mnt/vast/hf_hub_cache/" export PORT_OFFSET=${USER: -1} PARTITION="h100" -SQUASH_FILE="/mnt/vast/squash/image_${MODEL_CODE}_h100.sqsh" +SQUASH_FILE="/mnt/vast/squash/image_${MODEL_CODE}_h100-2.sqsh" salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell JOB_ID=$(squeue -u $USER -h -o %A) diff --git a/runners/launch_h200-cw.sh b/runners/launch_h200-cw.sh index 3245cb379..1329fd4f7 100644 --- a/runners/launch_h200-cw.sh +++ b/runners/launch_h200-cw.sh @@ -5,10 +5,10 @@ export HF_HUB_CACHE_MOUNT="/mnt/vast/hf_hub_cache/" export PORT_OFFSET=${USER: -1} PARTITION="h200" -SQUASH_FILE="/mnt/vast/squash/image_${MODEL_CODE}_h200.sqsh" +SQUASH_FILE="/mnt/vast/squash/image_${MODEL_CODE}_h200-2.sqsh" salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell -JOB_ID=$(squeue -u $USER -h -o %A) +JOB_ID=$(squeue -u $USER -h -o %A | tail -1) set -x srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" diff --git a/runners/launch_h200-nb.sh b/runners/launch_h200-nb.sh index 028cf8033..7d4dbd2df 100644 --- a/runners/launch_h200-nb.sh +++ b/runners/launch_h200-nb.sh @@ -5,7 +5,7 @@ export HF_HUB_CACHE_MOUNT="/home/hf_hub_cache/" export PORT_OFFSET=${USER: -1} PARTITION="main" -SQUASH_FILE="/home/squash/image_${MODEL_CODE}_h200.sqsh" +SQUASH_FILE="/home/squash/image_${MODEL_CODE}_h200-2.sqsh" salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell JOB_ID=$(squeue -u $USER -h -o %A) diff --git a/runners/launch_h200-nv.sh b/runners/launch_h200-nv.sh index 4bedf9b71..b5b2d7df5 100644 --- a/runners/launch_h200-nv.sh +++ b/runners/launch_h200-nv.sh @@ -5,10 +5,10 @@ export HF_HUB_CACHE_MOUNT="/raid/hf_hub_cache/" export PORT_OFFSET=${USER: -1} PARTITION="dgx-h200" -SQUASH_FILE="/raid/image_${MODEL_CODE}_h200.sqsh" +SQUASH_FILE="/raid/image_${MODEL_CODE}_${RUNNER_LABEL}-2.sqsh" salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell -JOB_ID=$(squeue -u $USER -h -o %A) +JOB_ID=$(squeue -u $USER -h -o %A | tail -1) set -x srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" @@ -18,6 +18,6 @@ srun --jobid=$JOB_ID \ --container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ -bash benchmarks/${MODEL_CODE}_h200_slurm.sh +bash benchmarks/${MODEL_CODE}_${RUNNER_LABEL}_slurm.sh scancel $JOB_ID diff --git a/utils/plot_perf.py b/utils/plot_perf.py index 35eb46eb2..5b2909fe3 100644 --- a/utils/plot_perf.py +++ b/utils/plot_perf.py @@ -9,7 +9,9 @@ hw_color = { 'h100': 'lightgreen', 'h200': 'green', + 'h200-trt': 'darkgreen', 'b200': 'black', + 'b200-trt': 'darkblue', 'mi300x': 'pink', 'mi325x': 'red', 'mi355x': 'purple' diff --git a/utils/process_result.py b/utils/process_result.py index d0f0ef000..e7b697361 100644 --- a/utils/process_result.py +++ b/utils/process_result.py @@ -3,15 +3,27 @@ from pathlib import Path -hw = sys.argv[1] +framework = sys.argv[1] # First arg is the framework (TRT-LLM, vLLM, SGLang, etc.) tp_size = int(sys.argv[2]) result_filename = sys.argv[3] with open(f'{result_filename}.json') as f: bmk_result = json.load(f) +# Extract hardware from result filename or runner name +# Result filename format: {exp-name}_tp{tp}_conc{conc}_{runner} +# We need to extract the hardware type from the runner +result_parts = result_filename.split('_') +if len(result_parts) >= 4: + runner_part = result_parts[-1] # Last part is the runner + # Extract hardware type (e.g., 'b200' from 'b200-nv_0') + hw = runner_part.split('-')[0].upper() # Convert to uppercase for consistency +else: + hw = "UNKNOWN" + data = { - 'hw': hw, + 'hw': hw, # Hardware (B200, H200, etc.) + 'framework': framework, # Framework (TRT-LLM, vLLM, SGLang, etc.) 'tp': tp_size, 'conc': int(bmk_result['max_concurrency']), 'model': bmk_result['model_id'], diff --git a/utils/summarize.py b/utils/summarize.py index 20d9ae127..6c6f9dc43 100644 --- a/utils/summarize.py +++ b/utils/summarize.py @@ -12,14 +12,29 @@ results.sort(key=lambda r: (r['hw'], r['tp'], r['conc'])) summary_header = f'''\ -| Hardware | TP | Conc | TTFT (ms) | TPOT (ms) | E2EL (s) | TPUT per GPU | -| :-: | :-: | :-: | :-: | :-: | :-: | :-: |\ +| Hardware | Framework | TP | Conc | TTFT (ms) | TPOT (ms) | E2EL (s) | TPUT per GPU | +| :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |\ ''' print(summary_header) for result in results: + # Extract framework - prefer explicit framework field, fallback to detection + framework = result.get('framework', 'vLLM') # default to vLLM if not specified + + # If no explicit framework field, try to detect from other fields + if framework == 'vLLM': + exp_name = result.get('exp_name', '') + runner = result.get('runner', '') + + # Check for TRT-LLM indicators + if ('trt' in exp_name.lower() or 'trt' in runner.lower() or + 'trt-llm' in exp_name.lower() or 'trt-llm' in runner.lower() or + 'tensorrt' in exp_name.lower() or 'tensorrt' in runner.lower()): + framework = 'TRT-LLM' + print( f"| {result['hw'].upper()} " + f"| {framework} " f"| {result['tp']} " f"| {result['conc']} " f"| {(result['median_ttft'] * 1000):.4f} "