Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
d556b88
add trt init for 70b
kedarpotdar-nv Aug 28, 2025
426f48e
remove dsr1 and add $MAX_MODEL_LEN to launch configs
kedarpotdar-nv Aug 28, 2025
12a7f6e
remove b200 tg
kedarpotdar-nv Aug 28, 2025
0fc8ab4
add RUNNER LABEL and temporarily remove bmk-b200?
kedarpotdar-nv Aug 28, 2025
4b30c03
fix per kimbo's suggestion
kedarpotdar-nv Aug 28, 2025
aab2320
revert local runner var
kedarpotdar-nv Aug 28, 2025
0c5ad16
update sqsh file name to include runner name, e.g. trt
kedarpotdar-nv Aug 28, 2025
7487baa
temporarily remove other benchmarks. only keep bmk-b200-trt
kedarpotdar-nv Aug 28, 2025
1233b53
refactor scheduler to add trt tag, update ngc image address, update …
kedarpotdar-nv Aug 28, 2025
7800006
refactor trt into separate yml
kedarpotdar-nv Aug 28, 2025
43057dd
fix file name
kedarpotdar-nv Aug 28, 2025
a94fbd0
comment vllm for now
kedarpotdar-nv Aug 28, 2025
0225b10
update port in trtllm-serve
kedarpotdar-nv Aug 28, 2025
1e594f3
update artifact name to have runner name at end
kedarpotdar-nv Aug 28, 2025
f63768c
update plot function with b200-trt
kedarpotdar-nv Aug 29, 2025
ed20d23
add h200 trt
kedarpotdar-nv Aug 29, 2025
25566a9
fix launch slurm script based on runner label
kedarpotdar-nv Aug 29, 2025
d33cda5
better identify if result is vllm or trt
kedarpotdar-nv Aug 29, 2025
de2d8de
clarify runners for trt and vllm
kedarpotdar-nv Aug 29, 2025
80dc11d
fix runner names
kedarpotdar-nv Aug 29, 2025
3cf357b
remove trt runners
kedarpotdar-nv Aug 29, 2025
9d7cbd3
ensure trt runners are correctly tagged
kedarpotdar-nv Aug 29, 2025
a2ed19c
rename launch scripts
kedarpotdar-nv Aug 29, 2025
fd1ff2e
only get latest run id
kedarpotdar-nv Aug 29, 2025
63d11bf
update trtllm image version
kedarpotdar-nv Aug 29, 2025
85a6e51
img ids
kedarpotdar-nv Aug 29, 2025
6c8af51
add fw identifier to benchmark template
kedarpotdar-nv Aug 29, 2025
9946fb8
limit concurrency for now
kedarpotdar-nv Aug 29, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/70b-tmpl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ jobs:
runner: h200
image: 'kedarpotdar147/vllm0.1:latest'
model: 'nvidia/Llama-3.1-70B-Instruct-FP8'
tp-list: '[1, 2, 4, 8]'
tp-list: '[2]'
timeout: ${{ inputs.timeout }}

bmk-b200:
Expand All @@ -75,7 +75,7 @@ jobs:
runner: b200
image: 'kedarpotdar147/vllm0.1:latest'
model: 'nvidia/Llama-3.1-70B-Instruct-FP8'
tp-list: '[1, 2, 4, 8]'
tp-list: '[2]'
timeout: ${{ inputs.timeout }}

bmk-mi300x:
Expand Down
71 changes: 71 additions & 0 deletions .github/workflows/70b-trt-tmpl.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# Reusable workflow: benchmark Llama-3.3-70B under TRT-LLM on B200 and H200
# runners, then collect results. Invoked via `workflow_call` from the scheduler.
name: LLaMA 70B TRT-LLM Template

on:
  workflow_call:
    inputs:
      exp-name:
        required: true
        type: string
      isl:
        required: true
        type: string
      osl:
        required: true
        type: string
      max-model-len:
        required: true
        type: string
      random-range-ratio:
        required: true
        type: string
      timeout:
        required: false
        type: number
        default: 180

jobs:
  # Placeholder job: image discovery is not automated yet, so the image tags
  # below are pinned by hand.
  find-latest-image:
    runs-on: ubuntu-latest
    steps:
      - name: Find the latest Docker image
        run: echo "Hardcoding image tags for now."

  bmk-b200-trt:
    needs: find-latest-image
    uses: ./.github/workflows/benchmark-tmpl.yml
    secrets: inherit
    with:
      exp-name: ${{ inputs.exp-name }}
      isl: ${{ inputs.isl }}
      osl: ${{ inputs.osl }}
      max-model-len: ${{ inputs.max-model-len }}
      random-range-ratio: ${{ inputs.random-range-ratio }}
      runner: b200
      # enroot-style image URI: '#' separates the registry host from the path.
      image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc1'
      model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
      tp-list: '[2]'
      timeout: ${{ inputs.timeout }}

  bmk-h200-trt:
    needs: find-latest-image
    uses: ./.github/workflows/benchmark-tmpl.yml
    secrets: inherit
    with:
      exp-name: ${{ inputs.exp-name }}
      isl: ${{ inputs.isl }}
      osl: ${{ inputs.osl }}
      max-model-len: ${{ inputs.max-model-len }}
      random-range-ratio: ${{ inputs.random-range-ratio }}
      runner: h200
      image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc1'
      model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
      tp-list: '[2]'
      timeout: ${{ inputs.timeout }}

  # Run even when a benchmark job fails, but not when the run was cancelled,
  # so partial results are still gathered.
  collect-results:
    needs: [bmk-b200-trt, bmk-h200-trt]
    if: ${{ always() && !cancelled() }}
    uses: ./.github/workflows/collect-results.yml
    secrets: inherit
    with:
      exp-name: ${{ inputs.exp-name }}
23 changes: 18 additions & 5 deletions .github/workflows/benchmark-tmpl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ env:
MAX_MODEL_LEN: ${{ inputs.max-model-len }}
RANDOM_RANGE_RATIO: ${{ inputs.random-range-ratio }}
IMAGE: ${{ inputs.image }}
RUNNER_LABEL: ${{ inputs.runner }}

jobs:
benchmark:
Expand All @@ -53,7 +54,7 @@ jobs:
fail-fast: false
matrix:
tp: ${{ fromJson(inputs.tp-list) }}
conc: [4, 8, 16, 32, 64]
conc: [4]
name: '${{ inputs.runner }} (tp${{ matrix.tp }} , conc${{ matrix.conc }})'

env:
Expand All @@ -68,7 +69,7 @@ jobs:

- name: Set result filename
run: |
RESULT_FILENAME=${{ env.EXP_NAME }}_tp${{ env.TP }}_conc${{ env.CONC }}_${{ runner.name }}
RESULT_FILENAME=${{ env.EXP_NAME }}_tp${{ env.TP }}_conc${{ env.CONC }}_${{ inputs.runner }}
echo "RESULT_FILENAME=${RESULT_FILENAME}" >> $GITHUB_ENV

- name: Launch job script
Expand All @@ -77,10 +78,22 @@ jobs:
bash ./runners/launch_${RUNNER_NAME%%_*}.sh ${{ inputs.exp-name }}

- name: Process result
run: python3 utils/process_result.py ${{ inputs.runner }} ${{ env.TP }} ${{ env.RESULT_FILENAME }}
run: |
RESULT_FILENAME=${{ env.EXP_NAME }}_tp${{ env.TP }}_conc${{ env.CONC }}_${{ inputs.runner }}
# Determine framework based on image
if [[ "${{ inputs.image }}" == *"tensorrt-llm"* ]]; then
FRAMEWORK="TRT-LLM"
elif [[ "${{ inputs.image }}" == *"vllm"* ]]; then
FRAMEWORK="vLLM"
elif [[ "${{ inputs.image }}" == *"sglang"* ]]; then
FRAMEWORK="SGLang"
else
FRAMEWORK="${{ inputs.runner }}"
fi
python3 utils/process_result.py $FRAMEWORK ${{ env.TP }} $RESULT_FILENAME

- name: Upload result
uses: actions/upload-artifact@v4
with:
name: ${{ env.RESULT_FILENAME }}
path: agg_${{ env.RESULT_FILENAME }}.json
name: ${{ env.EXP_NAME }}_tp${{ env.TP }}_conc${{ env.CONC }}_${{ runner.name }}
path: agg_${{ env.EXP_NAME }}_tp${{ env.TP }}_conc${{ env.CONC }}_${{ inputs.runner }}.json
2 changes: 1 addition & 1 deletion .github/workflows/cluster-cleanup.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ jobs:
runner:
- 'h100-cr_0'
- 'h100-cr_1'
- 'b200-tg_0'
# - 'b200-tg_0'
- 'mi300x-cr_0'
- 'mi300x-amd_0'
- 'mi300x-amd_1'
Expand Down
142 changes: 88 additions & 54 deletions .github/workflows/workflow-scheduler.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,69 +13,103 @@ jobs:
cleanup:
uses: ./.github/workflows/cluster-cleanup.yml

_70b-1k1k:
# _70b-1k1k:
# needs: cleanup
# uses: ./.github/workflows/70b-tmpl.yml
# secrets: inherit
# with:
# exp-name: '70b_1k1k'
# isl: 1024
# osl: 1024
# max-model-len: 2048
# random-range-ratio: 0.8

_70b-trt-1k1k:
needs: cleanup
uses: ./.github/workflows/70b-tmpl.yml
uses: ./.github/workflows/70b-trt-tmpl.yml
secrets: inherit
with:
exp-name: '70b_1k1k'
exp-name: '70b-trt_1k1k'
isl: 1024
osl: 1024
max-model-len: 2048
random-range-ratio: 0.8

dsr1-1k1k:
needs: cleanup
uses: ./.github/workflows/dsr1-tmpl.yml
secrets: inherit
with:
exp-name: 'dsr1_1k1k'
isl: 1024
osl: 1024
max-model-len: 2048
random-range-ratio: 0.8
# dsr1-1k1k:
# needs: cleanup
# uses: ./.github/workflows/dsr1-tmpl.yml
# secrets: inherit
# with:
# exp-name: 'dsr1_1k1k'
# isl: 1024
# osl: 1024
# max-model-len: 2048
# random-range-ratio: 0.8

_70b-8k1k:
needs: cleanup
uses: ./.github/workflows/70b-tmpl.yml
secrets: inherit
with:
exp-name: '70b_8k1k'
isl: 8192
osl: 1024
max-model-len: 9216
random-range-ratio: 0.8
# _70b-8k1k:
# needs: cleanup
# uses: ./.github/workflows/70b-tmpl.yml
# secrets: inherit
# with:
# exp-name: '70b_8k1k'
# isl: 8192
# osl: 1024
# max-model-len: 9216
# random-range-ratio: 0.8

# _70b-trt-8k1k:
# needs: cleanup
# uses: ./.github/workflows/70b-trt-tmpl.yml
# secrets: inherit
# with:
# exp-name: '70b-trt_8k1k'
# isl: 8192
# osl: 1024
# max-model-len: 9216
# random-range-ratio: 0.8

dsr1-8k1k:
needs: cleanup
uses: ./.github/workflows/dsr1-tmpl.yml
secrets: inherit
with:
exp-name: 'dsr1_8k1k'
isl: 8192
osl: 1024
max-model-len: 9216
random-range-ratio: 0.8
# dsr1-8k1k:
# needs: cleanup
# uses: ./.github/workflows/dsr1-tmpl.yml
# secrets: inherit
# with:
# exp-name: 'dsr1_8k1k'
# isl: 8192
# osl: 1024
# max-model-len: 9216
# random-range-ratio: 0.8

_70b-1k8k:
needs: cleanup
uses: ./.github/workflows/70b-tmpl.yml
secrets: inherit
with:
exp-name: '70b_1k8k'
isl: 1024
osl: 8192
max-model-len: 9216
random-range-ratio: 0.8
timeout: 240
# _70b-1k8k:
# needs: cleanup
# uses: ./.github/workflows/70b-tmpl.yml
# secrets: inherit
# with:
# exp-name: '70b_1k8k'
# isl: 1024
# osl: 8192
# max-model-len: 9216
# random-range-ratio: 0.8
# timeout: 240

dsr1-1k8k:
needs: cleanup
uses: ./.github/workflows/dsr1-tmpl.yml
secrets: inherit
with:
exp-name: 'dsr1_1k8k'
isl: 1024
osl: 8192
max-model-len: 9216
random-range-ratio: 0.8
# _70b-trt-1k8k:
# needs: cleanup
# uses: ./.github/workflows/70b-trt-tmpl.yml
# secrets: inherit
# with:
# exp-name: '70b-trt_1k8k'
# isl: 1024
# osl: 8192
# max-model-len: 9216
# random-range-ratio: 0.8
# timeout: 240

# dsr1-1k8k:
# needs: cleanup
# uses: ./.github/workflows/dsr1-tmpl.yml
# secrets: inherit
# with:
# exp-name: 'dsr1_1k8k'
# isl: 1024
# osl: 8192
# max-model-len: 9216
# random-range-ratio: 0.8
63 changes: 63 additions & 0 deletions benchmarks/70b-trt_b200_slurm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#!/usr/bin/env bash
# Launch a TRT-LLM server for the given model on a B200 SLURM node, wait for
# startup, then run the bench_serving benchmark against it and save results.

# === Required Env Vars ===
# HF_TOKEN           - Hugging Face auth token (consumed by `hf download`)
# HF_HUB_CACHE       - Hugging Face model cache directory
# IMAGE              - container image (used by the surrounding launcher)
# MODEL              - HF model id served by trtllm-serve
# ISL / OSL          - input / output sequence lengths for random prompts
# MAX_MODEL_LEN      - max sequence length (also passed as max_num_tokens)
# RANDOM_RANGE_RATIO - length-variation ratio for the random dataset
# TP                 - tensor-parallel size
# CONC               - max benchmark concurrency (num prompts = CONC * 10)
# RESULT_FILENAME    - basename (without extension) for the result JSON
# PORT_OFFSET        - offset added to base port 8888 to avoid collisions

echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"

set -x
hf download "$MODEL"
SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
PORT=$(( 8888 + PORT_OFFSET ))

# Extra LLM API options for trtllm-serve. Quoted heredoc delimiter: the body
# is written verbatim, with no variable expansion.
cat > llama-config.yml << 'EOF'
enable_attention_dp: false
cuda_graph_config:
  enable_padding: true
  max_batch_size: 1024
kv_cache_config:
  dtype: fp8
  enable_block_reuse: false
stream_interval: 4
EOF

mpirun -n 1 --oversubscribe --allow-run-as-root \
  trtllm-serve "$MODEL" --tp_size "$TP" --trust_remote_code \
  --max_seq_len "$MAX_MODEL_LEN" --max_num_tokens "$MAX_MODEL_LEN" \
  --num_postprocess_workers 2 --extra_llm_api_options llama-config.yml \
  --port "$PORT" > "$SERVER_LOG" 2>&1 &

# Readiness watch: stream the server log, fail fast on any line containing
# "error" (case-insensitive), proceed once startup completes.
# NOTE(review): the `tail -F` from the process substitution is left running
# after `break`; it is reaped when the SLURM job ends — confirm acceptable.
# NOTE(review): no timeout — if the server never errors or starts, this
# loop blocks until the job's walltime limit.
set +x
while IFS= read -r line; do
  printf '%s\n' "$line"
  if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then
    sleep 5                    # give the server a moment to flush its log
    tail -n100 "$SERVER_LOG"
    echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME"
    exit 1
  fi
  if [[ "$line" == *"Application startup complete"* ]]; then
    break
  fi
done < <(tail -F -n0 "$SERVER_LOG")

set -x
git clone https://github.com/kimbochen/bench_serving.git
python3 bench_serving/benchmark_serving.py \
  --model "$MODEL" --backend openai \
  --base-url "http://0.0.0.0:$PORT" \
  --dataset-name random \
  --random-input-len "$ISL" --random-output-len "$OSL" \
  --random-range-ratio "$RANDOM_RANGE_RATIO" \
  --num-prompts $(( CONC * 10 )) --max-concurrency "$CONC" \
  --request-rate inf --ignore-eos \
  --save-result --percentile-metrics 'ttft,tpot,itl,e2el' \
  --result-dir /workspace/ \
  --result-filename "$RESULT_FILENAME.json"
Loading