From f4c7925098ef7dcc3ef1992ca409daca1bf42aa4 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 14:19:08 -0600 Subject: [PATCH 01/60] initial poc --- benchmarks/gptoss_fp4_h100_docker.sh | 29 +++++++++++- benchmarks/gptoss_fp4_h100_slurm.sh | 1 - runners/launch_h100-cr.sh | 68 ++++++++++++++-------------- 3 files changed, 62 insertions(+), 36 deletions(-) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index a8bb57c16..39a5abf63 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -7,6 +7,9 @@ # MAX_MODEL_LEN # TP # CONC +# ISL +# OSL + cat > config.yaml << EOF compilation-config: '{"cudagraph_mode":"PIECEWISE"}' @@ -18,6 +21,7 @@ max-model-len: 10240 EOF export PYTHONNOUSERSITE=1 +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) set -x vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ @@ -25,4 +29,27 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --gpu-memory-utilization=0.9 \ --tensor-parallel-size=$TP \ --max-num-seqs=$CONC \ ---disable-log-requests +--disable-log-requests > $SERVER_LOG 2>&1 & + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" =~ Application\ startup\ complete ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +pip install -q datasets pandas +git clone https://github.com/kimbochen/bench_serving.git +set -x +python3 bench_serving/benchmark_serving.py \ +--model=$MODEL \ +--backend=vllm \ +--base-url=\"http://localhost:$PORT\" \ +--dataset-name=random \ +--random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ +--num-prompts=$(( $CONC * 10 )) --max-concurrency=$CONC \ +--request-rate=inf --ignore-eos \ +--save-result --percentile-metrics='ttft,tpot,itl,e2el' \ +--result-dir=/workspace/ \ +--result-filename=$RESULT_FILENAME.json" \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh index d2819b5b3..e9092703a 100644 --- 
a/benchmarks/gptoss_fp4_h100_slurm.sh +++ b/benchmarks/gptoss_fp4_h100_slurm.sh @@ -3,7 +3,6 @@ # === Required Env Vars === # HF_TOKEN # HF_HUB_CACHE -# IMAGE # MODEL # ISL # OSL diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh index 47b350128..1eb58c32e 100644 --- a/runners/launch_h100-cr.sh +++ b/runners/launch_h100-cr.sh @@ -4,7 +4,7 @@ HF_HUB_CACHE_MOUNT="/home/ubuntu/hf_hub_cache/" PORT=8888 server_name="bmk-server" -client_name="bmk-client" +# client_name="bmk-client" set -x docker run --rm -d --network=host --name=$server_name \ @@ -17,38 +17,38 @@ docker run --rm -d --network=host --name=$server_name \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_h100_docker.sh" -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ Application\ startup\ complete ]]; then - break - fi -done < <(docker logs -f --tail=0 $server_name 2>&1) - -if ! docker ps --format "{{.Names}}" | grep -q "$server_name"; then - echo "Server container launch failed." - exit 1 -fi - -git clone https://github.com/kimbochen/bench_serving.git - -set -x -docker run --rm --network=host --name=$client_name \ --v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ ---entrypoint=/bin/bash \ -$IMAGE \ --lc "pip install -q datasets pandas && \ -python3 bench_serving/benchmark_serving.py \ ---model=$MODEL \ ---backend=vllm \ ---base-url=\"http://localhost:$PORT\" \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) --max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics='ttft,tpot,itl,e2el' \ ---result-dir=/workspace/ \ ---result-filename=$RESULT_FILENAME.json" +# set +x +# while IFS= read -r line; do +# printf '%s\n' "$line" +# if [[ "$line" =~ Application\ startup\ complete ]]; then +# break +# fi +# done < <(docker logs -f --tail=0 $server_name 2>&1) + +# if ! 
docker ps --format "{{.Names}}" | grep -q "$server_name"; then +# echo "Server container launch failed." +# exit 1 +# fi + +# git clone https://github.com/kimbochen/bench_serving.git + +# set -x +# docker run --rm --network=host --name=$client_name \ +# -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ +# -e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ +# --entrypoint=/bin/bash \ +# $IMAGE \ +# -lc "pip install -q datasets pandas && \ +# python3 bench_serving/benchmark_serving.py \ +# --model=$MODEL \ +# --backend=vllm \ +# --base-url=\"http://localhost:$PORT\" \ +# --dataset-name=random \ +# --random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ +# --num-prompts=$(( $CONC * 10 )) --max-concurrency=$CONC \ +# --request-rate=inf --ignore-eos \ +# --save-result --percentile-metrics='ttft,tpot,itl,e2el' \ +# --result-dir=/workspace/ \ +# --result-filename=$RESULT_FILENAME.json" docker stop $server_name From d0688942b4133528730b404d1f9bdc0e9c632582 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 14:45:02 -0600 Subject: [PATCH 02/60] remove -d flag when launching docker container --- runners/launch_h100-cr.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh index 1eb58c32e..51def9743 100644 --- a/runners/launch_h100-cr.sh +++ b/runners/launch_h100-cr.sh @@ -7,7 +7,7 @@ server_name="bmk-server" # client_name="bmk-client" set -x -docker run --rm -d --network=host --name=$server_name \ +docker run --rm --network=host --name=$server_name \ --runtime=nvidia --gpus=all --ipc=host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ From 0a90d871bd8b560b350e43cc23348ddcff7070af Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 14:50:32 -0600 Subject: [PATCH 03/60] syntax error --- benchmarks/gptoss_fp4_h100_docker.sh | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 39a5abf63..3700ea357 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -52,4 +52,4 @@ python3 bench_serving/benchmark_serving.py \ --request-rate=inf --ignore-eos \ --save-result --percentile-metrics='ttft,tpot,itl,e2el' \ --result-dir=/workspace/ \ ---result-filename=$RESULT_FILENAME.json" \ No newline at end of file +--result-filename=$RESULT_FILENAME.json \ No newline at end of file From 79c49cf2b641b6857e8d8265632faac8a22c6737 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 14:58:35 -0600 Subject: [PATCH 04/60] compatibility fixes --- benchmarks/gptoss_fp4_h100_docker.sh | 11 ++++------- runners/launch_h100-cr.sh | 2 +- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 3700ea357..aee233793 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -5,6 +5,7 @@ # HF_HUB_CACHE # MODEL # MAX_MODEL_LEN +# RANDOM_RANGE_RATIO # TP # CONC # ISL @@ -21,7 +22,6 @@ max-model-len: 10240 EOF export PYTHONNOUSERSITE=1 -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) set -x vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ @@ -32,12 +32,9 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --disable-log-requests > $SERVER_LOG 2>&1 & set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ Application\ startup\ complete ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +until curl --output /dev/null --silent --head --fail http://localhost:$PORT; do + sleep 5 +done pip install -q datasets pandas git clone https://github.com/kimbochen/bench_serving.git diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh index 51def9743..8553d9b59 100644 --- a/runners/launch_h100-cr.sh +++ b/runners/launch_h100-cr.sh @@ -11,7 
+11,7 @@ docker run --rm --network=host --name=$server_name \ --runtime=nvidia --gpus=all --ipc=host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT \ +-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \ -e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --entrypoint=/bin/bash \ $IMAGE \ From de78e18e22c0ab35bfccb7450d81293097b0cdd2 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 14:59:51 -0600 Subject: [PATCH 05/60] add correct endpoint prefix --- benchmarks/gptoss_fp4_h100_docker.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index aee233793..80af1b8e0 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -32,7 +32,7 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --disable-log-requests > $SERVER_LOG 2>&1 & set +x -until curl --output /dev/null --silent --head --fail http://localhost:$PORT; do +until curl --output /dev/null --silent --head --fail http://localhost:$PORT/health; do sleep 5 done From 6a38bfb5d1fe66a0b0003b67c0a78ff0b622bb2a Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 15:06:41 -0600 Subject: [PATCH 06/60] remove reference env var --- benchmarks/gptoss_fp4_h100_docker.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 80af1b8e0..d914e6a06 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -29,7 +29,7 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --gpu-memory-utilization=0.9 \ 
--tensor-parallel-size=$TP \ --max-num-seqs=$CONC \ ---disable-log-requests > $SERVER_LOG 2>&1 & +--disable-log-requests set +x until curl --output /dev/null --silent --head --fail http://localhost:$PORT/health; do From f2e40ee4576a9d7887d06fdc2593b449d137a67f Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 15:13:14 -0600 Subject: [PATCH 07/60] run vllm serve in background --- benchmarks/gptoss_fp4_h100_docker.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index d914e6a06..e4efb03ec 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -29,10 +29,10 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --gpu-memory-utilization=0.9 \ --tensor-parallel-size=$TP \ --max-num-seqs=$CONC \ ---disable-log-requests +--disable-log-requests & set +x -until curl --output /dev/null --silent --head --fail http://localhost:$PORT/health; do +until curl --output /dev/null --silent --fail http://localhost:$PORT/health; do sleep 5 done From 272c81dd56b0c43fdec85241f3e3a48222304593 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 15:18:56 -0600 Subject: [PATCH 08/60] unescape sequences --- benchmarks/gptoss_fp4_h100_docker.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index e4efb03ec..5ba23d1ff 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -42,7 +42,7 @@ set -x python3 bench_serving/benchmark_serving.py \ --model=$MODEL \ --backend=vllm \ ---base-url=\"http://localhost:$PORT\" \ +--base-url=http://localhost:$PORT \ --dataset-name=random \ --random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ --num-prompts=$(( $CONC * 10 )) --max-concurrency=$CONC \ From 4d27a9ddbd92caeaec1e177519ca8030fb663bc0 Mon Sep 17 00:00:00 2001 From: Cam 
Quilici Date: Wed, 12 Nov 2025 15:38:27 -0600 Subject: [PATCH 09/60] stop vllm to stdout after it stops --- benchmarks/gptoss_fp4_h100_docker.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 5ba23d1ff..ae5fb2a9f 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -31,11 +31,17 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --max-num-seqs=$CONC \ --disable-log-requests & +SERVER_PID=$! set +x +tail -f /tmp/vllm_server.log & +TAIL_PID=$! + until curl --output /dev/null --silent --fail http://localhost:$PORT/health; do sleep 5 done +kill $TAIL_PID 2>/dev/null + pip install -q datasets pandas git clone https://github.com/kimbochen/bench_serving.git set -x From 5f549dc8c7da6180950cdfa27abfd7f052de48ab Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 15:41:45 -0600 Subject: [PATCH 10/60] stop vllm to stdout after it stops pt 2 --- benchmarks/gptoss_fp4_h100_docker.sh | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index ae5fb2a9f..4ef463bf1 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -29,18 +29,16 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --gpu-memory-utilization=0.9 \ --tensor-parallel-size=$TP \ --max-num-seqs=$CONC \ ---disable-log-requests & +--disable-log-requests 2>&1 | tee $(mktemp /tmp/server-XXXXXX.log) & -SERVER_PID=$! +VLLM_PID=$! set +x -tail -f /tmp/vllm_server.log & -TAIL_PID=$! 
until curl --output /dev/null --silent --fail http://localhost:$PORT/health; do sleep 5 done -kill $TAIL_PID 2>/dev/null +pkill -P $$ tee 2>/dev/null pip install -q datasets pandas git clone https://github.com/kimbochen/bench_serving.git From a3e70648f60577d3bbb1d8ec7a0bed5eb9de8bf9 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 15:47:37 -0600 Subject: [PATCH 11/60] get rid of docker stop as no longer in detatched --- runners/launch_h100-cr.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh index 8553d9b59..a34b31c88 100644 --- a/runners/launch_h100-cr.sh +++ b/runners/launch_h100-cr.sh @@ -51,4 +51,4 @@ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_h100_docker.sh" # --result-dir=/workspace/ \ # --result-filename=$RESULT_FILENAME.json" -docker stop $server_name +# docker stop $server_name From cabe36239bb80ebfaf44d18d7f3d9938a6b01c7d Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 16:01:09 -0600 Subject: [PATCH 12/60] clone bench serving to tmp dir --- benchmarks/gptoss_fp4_h100_docker.sh | 5 +++-- runners/launch_h100-cr.sh | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 4ef463bf1..5420c220d 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -41,9 +41,10 @@ done pkill -P $$ tee 2>/dev/null pip install -q datasets pandas -git clone https://github.com/kimbochen/bench_serving.git +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR set -x -python3 bench_serving/benchmark_serving.py \ +python3 $BENCH_SERVING_DIR/bench_serving/benchmark_serving.py \ --model=$MODEL \ --backend=vllm \ --base-url=http://localhost:$PORT \ diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh index a34b31c88..18c791614 100644 --- a/runners/launch_h100-cr.sh +++ 
b/runners/launch_h100-cr.sh @@ -12,7 +12,7 @@ docker run --rm --network=host --name=$server_name \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \ --e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ +-e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_h100_docker.sh" From 30209264c7b748e0ec4cec4877c37f0f41b14fa4 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 16:07:27 -0600 Subject: [PATCH 13/60] clone bench serving to tmp dir pt 2 --- benchmarks/gptoss_fp4_h100_docker.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 5420c220d..2c8bfb3c5 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -44,7 +44,7 @@ pip install -q datasets pandas BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR set -x -python3 $BENCH_SERVING_DIR/bench_serving/benchmark_serving.py \ +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model=$MODEL \ --backend=vllm \ --base-url=http://localhost:$PORT \ From 142923bce8e520cc0782e51492027514c71d89e4 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 16:14:36 -0600 Subject: [PATCH 14/60] add explanatory comment --- benchmarks/gptoss_fp4_h100_docker.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 2c8bfb3c5..6229bed85 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ 
b/benchmarks/gptoss_fp4_h100_docker.sh @@ -31,13 +31,12 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --max-num-seqs=$CONC \ --disable-log-requests 2>&1 | tee $(mktemp /tmp/server-XXXXXX.log) & +# Show server logs til' it is up, then stop showing VLLM_PID=$! set +x - until curl --output /dev/null --silent --fail http://localhost:$PORT/health; do sleep 5 done - pkill -P $$ tee 2>/dev/null pip install -q datasets pandas From 39fdbcece60facc9a138f14d1cedc9f3e17e61f3 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 16:35:43 -0600 Subject: [PATCH 15/60] cleaning up --- runners/launch_h100-cr.sh | 37 ------------------------------------- 1 file changed, 37 deletions(-) diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh index 18c791614..d1ddc26de 100644 --- a/runners/launch_h100-cr.sh +++ b/runners/launch_h100-cr.sh @@ -4,7 +4,6 @@ HF_HUB_CACHE_MOUNT="/home/ubuntu/hf_hub_cache/" PORT=8888 server_name="bmk-server" -# client_name="bmk-client" set -x docker run --rm --network=host --name=$server_name \ @@ -16,39 +15,3 @@ docker run --rm --network=host --name=$server_name \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_h100_docker.sh" - -# set +x -# while IFS= read -r line; do -# printf '%s\n' "$line" -# if [[ "$line" =~ Application\ startup\ complete ]]; then -# break -# fi -# done < <(docker logs -f --tail=0 $server_name 2>&1) - -# if ! docker ps --format "{{.Names}}" | grep -q "$server_name"; then -# echo "Server container launch failed." 
-# exit 1 -# fi - -# git clone https://github.com/kimbochen/bench_serving.git - -# set -x -# docker run --rm --network=host --name=$client_name \ -# -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -# -e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ -# --entrypoint=/bin/bash \ -# $IMAGE \ -# -lc "pip install -q datasets pandas && \ -# python3 bench_serving/benchmark_serving.py \ -# --model=$MODEL \ -# --backend=vllm \ -# --base-url=\"http://localhost:$PORT\" \ -# --dataset-name=random \ -# --random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ -# --num-prompts=$(( $CONC * 10 )) --max-concurrency=$CONC \ -# --request-rate=inf --ignore-eos \ -# --save-result --percentile-metrics='ttft,tpot,itl,e2el' \ -# --result-dir=/workspace/ \ -# --result-filename=$RESULT_FILENAME.json" - -# docker stop $server_name From 6e100581c9272f7f6be7d44a41a6985cdbe30fab Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 09:13:56 -0600 Subject: [PATCH 16/60] cleaning up --- benchmarks/gptoss_fp4_mi355x_docker.sh | 38 +++++++++++++++++++++++++- runners/launch_mi355x-amd.sh | 8 +++--- 2 files changed, 41 insertions(+), 5 deletions(-) diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh index 103e77fe3..de5ce9ce7 100644 --- a/benchmarks/gptoss_fp4_mi355x_docker.sh +++ b/benchmarks/gptoss_fp4_mi355x_docker.sh @@ -30,4 +30,40 @@ vllm serve $MODEL --port $PORT \ --block-size=64 \ --no-enable-prefix-caching \ --disable-log-requests \ ---async-scheduling +--async-scheduling | tee $(mktemp /tmp/server-XXXXXX.log) & + +# Show server logs til' it is up, then stop showing +VLLM_PID=$! 
+set +x +until curl --output /dev/null --silent --fail http://localhost:$PORT/health; do + sleep 5 +done +pkill -P $$ tee 2>/dev/null + +if [[ "$MODEL" == "amd/DeepSeek-R1-0528-MXFP4-Preview" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then + if [[ "$OSL" == "8192" ]]; then + NUM_PROMPTS=$(( CONC * 20 )) + else + NUM_PROMPTS=$(( CONC * 50 )) + fi +else + NUM_PROMPTS=$(( CONC * 10 )) +fi + +git clone https://github.com/kimbochen/bench_serving.git + +set -x +docker run --rm --network=$network_name --name=$client_name \ +-v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ +-e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ +--entrypoint=python3 \ +$IMAGE \ +bench_serving/benchmark_serving.py \ +--model=$MODEL --backend=vllm --base-url="http://localhost:$PORT" \ +--dataset-name=random \ +--random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ +--num-prompts=$NUM_PROMPTS \ +--max-concurrency=$CONC \ +--request-rate=inf --ignore-eos \ +--save-result --percentile-metrics="ttft,tpot,itl,e2el" \ +--result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json diff --git a/runners/launch_mi355x-amd.sh b/runners/launch_mi355x-amd.sh index 87ee8cbd2..b3cecf6e4 100644 --- a/runners/launch_mi355x-amd.sh +++ b/runners/launch_mi355x-amd.sh @@ -17,14 +17,14 @@ HF_HUB_CACHE_MOUNT="/nfsdata/hf_hub_cache-1/" # Temp solution PORT=8888 -network_name="bmk-net" +# network_name="bmk-net" server_name="bmk-server" -client_name="bmk-client" +# client_name="bmk-client" -docker network create $network_name +# docker network create $network_name set -x -docker run --rm -d --ipc=host --shm-size=16g --network=$network_name --name=$server_name \ +docker run --rm -d --ipc=host --shm-size=16g --network=host --name=$server_name \ --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ From 710793d80fb48efd6c9d0f14bd91273073747c01 
Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 10:57:36 -0600 Subject: [PATCH 17/60] adding mi355x refactor --- benchmarks/dsr1_fp8_mi355x_docker.sh | 33 ++++++++++- benchmarks/gptoss_fp4_h100_docker.sh | 1 - benchmarks/gptoss_fp4_mi355x_docker.sh | 24 ++------ runners/launch_mi355x-amd.sh | 78 +++++++++++++------------- 4 files changed, 76 insertions(+), 60 deletions(-) diff --git a/benchmarks/dsr1_fp8_mi355x_docker.sh b/benchmarks/dsr1_fp8_mi355x_docker.sh index f39a8dbbd..baad70fd8 100644 --- a/benchmarks/dsr1_fp8_mi355x_docker.sh +++ b/benchmarks/dsr1_fp8_mi355x_docker.sh @@ -24,5 +24,36 @@ python3 -m sglang.launch_server \ --mem-fraction-static 0.8 --disable-radix-cache \ --num-continuous-decode-steps 4 \ --max-prefill-tokens 196608 \ - --cuda-graph-max-bs 128 + --cuda-graph-max-bs 128 | tee $(mktemp /tmp/server-XXXXXX.log) & + +set +x +until curl --output /dev/null --silent --fail http://localhost:$PORT/health; do + sleep 5 +done +pkill -P $$ tee 2>/dev/null + +if [[ "$MODEL" == "amd/DeepSeek-R1-0528-MXFP4-Preview" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then + if [[ "$OSL" == "8192" ]]; then + NUM_PROMPTS=$(( CONC * 20 )) + else + NUM_PROMPTS=$(( CONC * 50 )) + fi +else + NUM_PROMPTS=$(( CONC * 10 )) +fi + +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +set -x +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ +--model=$MODEL --backend=vllm --base-url="http://localhost:$PORT" \ +--dataset-name=random \ +--random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ +--num-prompts=$NUM_PROMPTS \ +--max-concurrency=$CONC \ +--request-rate=inf --ignore-eos \ +--save-result --percentile-metrics="ttft,tpot,itl,e2el" \ +--result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json + + diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 6229bed85..1b4453be3 100644 --- 
a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -32,7 +32,6 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --disable-log-requests 2>&1 | tee $(mktemp /tmp/server-XXXXXX.log) & # Show server logs til' it is up, then stop showing -VLLM_PID=$! set +x until curl --output /dev/null --silent --fail http://localhost:$PORT/health; do sleep 5 diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh index de5ce9ce7..533f5e212 100644 --- a/benchmarks/gptoss_fp4_mi355x_docker.sh +++ b/benchmarks/gptoss_fp4_mi355x_docker.sh @@ -8,6 +8,8 @@ # TP # CONC # MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# RESULT_FILENAME cat > config.yaml << EOF compilation-config: '{"compile_sizes":[1,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98,100,102,104,106,108,110,112,114,116,118,120,122,124,126,128,256,512,1024,2048,8192] , "cudagraph_capture_sizes":[1,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98,100,102,104,106,108,110,112,114,116,118,120,122,124,126,128,136,144,152,160,168,176,184,192,200,208,216,224,232,240,248,256,264,272,280,288,296,304,312,320,328,336,344,352,360,368,376,384,392,400,408,416,424,432,440,448,456,464,472,480,488,496,504,512,520,528,536,544,552,560,568,576,584,592,600,608,616,624,632,640,648,656,664,672,680,688,696,704,712,720,728,736,744,752,760,768,776,784,792,800,808,816,824,832,840,848,856,864,872,880,888,896,904,912,920,928,936,944,952,960,968,976,984,992,1000,1008,1016,1024,2048,4096,8192] , "cudagraph_mode": "FULL_AND_PIECEWISE"}' @@ -33,32 +35,16 @@ vllm serve $MODEL --port $PORT \ --async-scheduling | tee $(mktemp /tmp/server-XXXXXX.log) & # Show server logs til' it is up, then stop showing -VLLM_PID=$! 
set +x until curl --output /dev/null --silent --fail http://localhost:$PORT/health; do sleep 5 done pkill -P $$ tee 2>/dev/null -if [[ "$MODEL" == "amd/DeepSeek-R1-0528-MXFP4-Preview" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then - if [[ "$OSL" == "8192" ]]; then - NUM_PROMPTS=$(( CONC * 20 )) - else - NUM_PROMPTS=$(( CONC * 50 )) - fi -else - NUM_PROMPTS=$(( CONC * 10 )) -fi - -git clone https://github.com/kimbochen/bench_serving.git - +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR set -x -docker run --rm --network=$network_name --name=$client_name \ --v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ ---entrypoint=python3 \ -$IMAGE \ -bench_serving/benchmark_serving.py \ +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model=$MODEL --backend=vllm --base-url="http://localhost:$PORT" \ --dataset-name=random \ --random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ diff --git a/runners/launch_mi355x-amd.sh b/runners/launch_mi355x-amd.sh index b3cecf6e4..009a53108 100644 --- a/runners/launch_mi355x-amd.sh +++ b/runners/launch_mi355x-amd.sh @@ -24,52 +24,52 @@ server_name="bmk-server" # docker network create $network_name set -x -docker run --rm -d --ipc=host --shm-size=16g --network=host --name=$server_name \ +docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ --e ISL -e OSL \ +-e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \ --entrypoint=/bin/bash \ $IMAGE \ 
benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi355x_docker.sh" -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ Application\ startup\ complete ]]; then - break - fi -done < <(docker logs -f --tail=0 $server_name 2>&1) +# set +x +# while IFS= read -r line; do +# printf '%s\n' "$line" +# if [[ "$line" =~ Application\ startup\ complete ]]; then +# break +# fi +# done < <(docker logs -f --tail=0 $server_name 2>&1) -if [[ "$MODEL" == "amd/DeepSeek-R1-0528-MXFP4-Preview" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then - if [[ "$OSL" == "8192" ]]; then - NUM_PROMPTS=$(( CONC * 20 )) - else - NUM_PROMPTS=$(( CONC * 50 )) - fi -else - NUM_PROMPTS=$(( CONC * 10 )) -fi +# if [[ "$MODEL" == "amd/DeepSeek-R1-0528-MXFP4-Preview" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then +# if [[ "$OSL" == "8192" ]]; then +# NUM_PROMPTS=$(( CONC * 20 )) +# else +# NUM_PROMPTS=$(( CONC * 50 )) +# fi +# else +# NUM_PROMPTS=$(( CONC * 10 )) +# fi -git clone https://github.com/kimbochen/bench_serving.git +# git clone https://github.com/kimbochen/bench_serving.git -set -x -docker run --rm --network=$network_name --name=$client_name \ --v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ ---entrypoint=python3 \ -$IMAGE \ -bench_serving/benchmark_serving.py \ ---model=$MODEL --backend=vllm --base-url="http://$server_name:$PORT" \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$NUM_PROMPTS \ ---max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics="ttft,tpot,itl,e2el" \ ---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json +# set -x +# docker run --rm --network=$network_name --name=$client_name \ +# -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ +# -e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ +# --entrypoint=python3 \ +# $IMAGE \ +# 
bench_serving/benchmark_serving.py \ +# --model=$MODEL --backend=vllm --base-url="http://$server_name:$PORT" \ +# --dataset-name=random \ +# --random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ +# --num-prompts=$NUM_PROMPTS \ +# --max-concurrency=$CONC \ +# --request-rate=inf --ignore-eos \ +# --save-result --percentile-metrics="ttft,tpot,itl,e2el" \ +# --result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json if ls gpucore.* 1> /dev/null 2>&1; then echo "gpucore files exist. not good" @@ -77,8 +77,8 @@ if ls gpucore.* 1> /dev/null 2>&1; then fi -while [ -n "$(docker ps -aq)" ]; do - docker stop $server_name - docker network rm $network_name - sleep 5 -done +# while [ -n "$(docker ps -aq)" ]; do +# docker stop $server_name +# # docker network rm $network_name +# sleep 5 +# done From 6b844a191dc2dea8faf0c867a82fff18e931eac8 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 11:37:44 -0600 Subject: [PATCH 18/60] adding h200 initial refactor --- benchmarks/gptoss_fp4_h200_trt_slurm.sh | 34 +++++++++++++++++-------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/benchmarks/gptoss_fp4_h200_trt_slurm.sh b/benchmarks/gptoss_fp4_h200_trt_slurm.sh index c148a3cb7..7e411b05a 100644 --- a/benchmarks/gptoss_fp4_h200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_trt_slurm.sh @@ -19,7 +19,7 @@ echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" hf download $MODEL -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +# SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) @@ -45,20 +45,32 @@ stream_interval: 20 EOF #mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens $MAX_MODEL_LEN --num_postprocess_workers 2 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & -mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --max_batch_size $CONC --max_num_tokens 20000 
--backend pytorch --extra_llm_api_options gptoss-config.yml --ep_size=$EP_SIZE --trust_remote_code --gpus_per_node 8 --host 0.0.0.0 --port $PORT --tp_size=$TP --pp_size=1 > $SERVER_LOG 2>&1 & - +mpirun -n 1 --oversubscribe --allow-run-as-root \ +trtllm-serve $MODEL \ +--max_batch_size $CONC \ +--max_num_tokens 20000 \ +--backend pytorch \ +--extra_llm_api_options gptoss-config.yml \ +--ep_size=$EP_SIZE \ +--trust_remote_code \ +--gpus_per_node 8 \ +--host 0.0.0.0 \ +--port $PORT \ +--tp_size=$TP \ +--pp_size=1 \ +2>&1 | tee $(mktemp /tmp/server-XXXXXX.log) & +# Show server logs til' it is up, then stop showing set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +pkill -P $$ tee 2>/dev/null set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend openai \ --base-url http://0.0.0.0:$PORT \ --dataset-name random \ From 7ed5aa91372eb7f4ece7ffbe56f6fd9b2d811e26 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 15:46:11 -0600 Subject: [PATCH 19/60] different way to see server logs --- benchmarks/gptoss_fp4_h200_trt_slurm.sh | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/benchmarks/gptoss_fp4_h200_trt_slurm.sh b/benchmarks/gptoss_fp4_h200_trt_slurm.sh index 7e411b05a..ac084ef13 100644 --- a/benchmarks/gptoss_fp4_h200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_trt_slurm.sh @@ -44,6 +44,8 @@ print_iter_log: true stream_interval: 20 EOF +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) + #mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP 
--trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens $MAX_MODEL_LEN --num_postprocess_workers 2 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & mpirun -n 1 --oversubscribe --allow-run-as-root \ trtllm-serve $MODEL \ @@ -58,14 +60,18 @@ trtllm-serve $MODEL \ --port $PORT \ --tp_size=$TP \ --pp_size=1 \ -2>&1 | tee $(mktemp /tmp/server-XXXXXX.log) & +> $SERVER_LOG 2>&1 & + +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! # Show server logs til' it is up, then stop showing set +x until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do sleep 5 done -pkill -P $$ tee 2>/dev/null +kill $TAIL_PID set -x BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) From fac88645fc7450946c7198f9de6ce5f024e364ff Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 15:59:58 -0600 Subject: [PATCH 20/60] cleanup --- benchmarks/dsr1_fp8_h200_trt_slurm.sh | 18 ++++++++++-------- benchmarks/gptoss_fp4_h100_docker.sh | 13 ++++++++----- benchmarks/gptoss_fp4_h100_slurm.sh | 18 ++++++++++-------- benchmarks/gptoss_fp4_h200_trt_slurm.sh | 7 +------ benchmarks/gptoss_fp4_mi355x_slurm.sh | 18 ++++++++++-------- 5 files changed, 39 insertions(+), 35 deletions(-) diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index 7b566c0ab..58d7e9724 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -71,17 +71,19 @@ PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \ > $SERVER_LOG 2>&1 & +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID -git clone https://github.com/kimbochen/bench_serving.git set -x -python3 bench_serving/benchmark_serving.py \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend openai \ --base-url http://0.0.0.0:$PORT \ --dataset-name random \ diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 1b4453be3..ae889474c 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -22,6 +22,7 @@ max-model-len: 10240 EOF export PYTHONNOUSERSITE=1 +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) set -x vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ @@ -29,19 +30,21 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --gpu-memory-utilization=0.9 \ --tensor-parallel-size=$TP \ --max-num-seqs=$CONC \ ---disable-log-requests 2>&1 | tee $(mktemp /tmp/server-XXXXXX.log) & +--disable-log-requests > $SERVER_LOG 2>&1 & -# Show server logs til' it is up, then stop showing +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
set +x -until curl --output /dev/null --silent --fail http://localhost:$PORT/health; do +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do sleep 5 done -pkill -P $$ tee 2>/dev/null +kill $TAIL_PID pip install -q datasets pandas +set -x BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -set -x python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model=$MODEL \ --backend=vllm \ diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh index e9092703a..d82bebf72 100644 --- a/benchmarks/gptoss_fp4_h100_slurm.sh +++ b/benchmarks/gptoss_fp4_h100_slurm.sh @@ -35,18 +35,20 @@ PYTHONNOUSERSITE=1 vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --max-num-seqs=$CONC \ --disable-log-requests > $SERVER_LOG 2>&1 & +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID pip install -q datasets pandas -git clone https://github.com/kimbochen/bench_serving.git set -x -python3 bench_serving/benchmark_serving.py \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model=$MODEL \ --backend=vllm \ --base-url="http://0.0.0.0:$PORT" \ diff --git a/benchmarks/gptoss_fp4_h200_trt_slurm.sh b/benchmarks/gptoss_fp4_h200_trt_slurm.sh index ac084ef13..0927a0d61 100644 --- a/benchmarks/gptoss_fp4_h200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_trt_slurm.sh @@ -19,7 +19,7 @@ echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" hf download $MODEL -# SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + 
$PORT_OFFSET )) @@ -44,9 +44,6 @@ print_iter_log: true stream_interval: 20 EOF -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) - -#mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens $MAX_MODEL_LEN --num_postprocess_workers 2 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & mpirun -n 1 --oversubscribe --allow-run-as-root \ trtllm-serve $MODEL \ --max_batch_size $CONC \ @@ -65,8 +62,6 @@ trtllm-serve $MODEL \ # Show logs until server is ready tail -f $SERVER_LOG & TAIL_PID=$! - -# Show server logs til' it is up, then stop showing set +x until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do sleep 5 diff --git a/benchmarks/gptoss_fp4_mi355x_slurm.sh b/benchmarks/gptoss_fp4_mi355x_slurm.sh index 657bc1fdf..1fcba771f 100644 --- a/benchmarks/gptoss_fp4_mi355x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi355x_slurm.sh @@ -38,17 +38,19 @@ vllm serve $MODEL --port $PORT \ --disable-log-requests \ --async-scheduling > $SERVER_LOG 2>&1 & +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend vllm \ --base-url "http://0.0.0.0:$PORT" \ --dataset-name random \ From 7c8f5a57d6854330a0bb3641616cb28ba8147a7e Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 16:39:39 -0600 Subject: [PATCH 21/60] now fail if server fails --- benchmarks/dsr1_fp8_h200_trt_slurm.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index 58d7e9724..a44769bc6 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -69,6 +69,7 @@ PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \ --tp_size=$TP --ep_size=$EP_SIZE \ --extra_llm_api_options=$EXTRA_CONFIG_FILE \ > $SERVER_LOG 2>&1 & +SERVER_PID=$! # Show logs until server is ready @@ -76,6 +77,10 @@ tail -f $SERVER_LOG & TAIL_PID=$! set +x until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + if ! kill -0 $SERVER_PID 2>/dev/null; then + echo "Server died before becoming healthy. Exiting." 
+ exit 1 + fi sleep 5 done kill $TAIL_PID From 9dd2b1459218803376eec6e488f6957f9929836c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 17:10:51 -0600 Subject: [PATCH 22/60] starting on b200 --- benchmarks/gptoss_fp4_b200_docker.sh | 28 +++++++++++++++++++++-- runners/launch_b200-tg.sh | 34 +--------------------------- 2 files changed, 27 insertions(+), 35 deletions(-) diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh index fd6ac15c5..28f3d29cf 100644 --- a/benchmarks/gptoss_fp4_b200_docker.sh +++ b/benchmarks/gptoss_fp4_b200_docker.sh @@ -43,8 +43,32 @@ export TORCH_CUDA_ARCH_LIST="10.0" export VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB='{"2":32,"4":32,"8":8}' export PYTHONNOUSERSITE=1 export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1 +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) set -x vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ ---gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs 512 \ ---disable-log-requests +--gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \ +--disable-log-requests > $SERVER_LOG 2>&1 & + +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
+set +x +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID + +pip install -q datasets pandas +set -x +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ +--model $MODEL --backend vllm --base-url http://localhost:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) \ +--max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ --result-filename $RESULT_FILENAME.json \ No newline at end of file diff --git a/runners/launch_b200-tg.sh b/runners/launch_b200-tg.sh index 9f313396c..97e975a64 100644 --- a/runners/launch_b200-tg.sh +++ b/runners/launch_b200-tg.sh @@ -5,7 +5,6 @@ FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') PORT=8888 server_name="bmk-server" -client_name="bmk-client" set -x docker run --rm -d --network host --name $server_name \ @@ -14,38 +13,7 @@ docker run --rm -d --network host --name $server_name \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE \ -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ +-e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh" - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ Application\ startup\ complete ]]; then - break - fi -done < <(docker logs -f --tail=0 $server_name 2>&1) - -git clone https://github.com/kimbochen/bench_serving.git - -set -x -docker run --rm --network 
host --name $client_name \ --v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ ---entrypoint=/bin/bash \ -$(echo "$IMAGE" | sed 's/#/\//') \ --lc "pip install -q datasets pandas && \ -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm --base-url http://localhost:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) \ ---max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json" - -while [ -n "$(docker ps -aq)" ]; do - docker stop $server_name - sleep 5 -done From 69991c64edc6d8a843ffdaa2a7f18674fb2daa29 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 17:20:53 -0600 Subject: [PATCH 23/60] doign b200 --- benchmarks/dsr1_fp4_b200_docker.sh | 27 ++++++++++++++++++++++- benchmarks/dsr1_fp4_b200_trt_slurm.sh | 18 +++++++++------- benchmarks/dsr1_fp8_b200_docker.sh | 26 +++++++++++++++++++++- benchmarks/dsr1_fp8_b200_trt_slurm.sh | 31 +++++++++++++++------------ benchmarks/gptoss_fp4_b200_docker.sh | 2 ++ 5 files changed, 80 insertions(+), 24 deletions(-) diff --git a/benchmarks/dsr1_fp4_b200_docker.sh b/benchmarks/dsr1_fp4_b200_docker.sh index 3c8232072..6b2112478 100644 --- a/benchmarks/dsr1_fp4_b200_docker.sh +++ b/benchmarks/dsr1_fp4_b200_docker.sh @@ -6,6 +6,8 @@ nvidia-smi # happens 1% of the time. ref: https://github.com/flashinfer-ai/flashinfer/pull/1779 sed -i '102,108d' /usr/local/lib/python3.12/dist-packages/flashinfer/jit/cubin_loader.py +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) + # Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls. if [[ $CONC -ge 16 ]]; then SCHEDULER_RECV_INTERVAL=30 @@ -22,5 +24,28 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0. 
--cuda-graph-max-bs 256 --max-running-requests 256 --mem-fraction-static 0.85 --kv-cache-dtype fp8_e4m3 \ --chunked-prefill-size 16384 \ --ep-size $EP_SIZE --quantization modelopt_fp4 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ ---enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10 +--enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10 > $SERVER_LOG 2>&1 & + +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! +set +x +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID + +pip install -q datasets pandas +set -x +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ +--model $MODEL --backend vllm --base-url http://localhost:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) \ +--max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ --result-filename $RESULT_FILENAME.json diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh index 6f4f814a0..6896880fb 100644 --- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh @@ -101,17 +101,19 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \ > $SERVER_LOG 2>&1 & +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID -git clone https://github.com/kimbochen/bench_serving.git set -x -python3 bench_serving/benchmark_serving.py \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend openai \ --base-url http://0.0.0.0:$PORT \ --dataset-name random \ diff --git a/benchmarks/dsr1_fp8_b200_docker.sh b/benchmarks/dsr1_fp8_b200_docker.sh index 361b6f1f6..babb5c9a6 100644 --- a/benchmarks/dsr1_fp8_b200_docker.sh +++ b/benchmarks/dsr1_fp8_b200_docker.sh @@ -17,6 +17,7 @@ sed -i '102,108d' /usr/local/lib/python3.12/dist-packages/flashinfer/jit/cubin_l export SGL_ENABLE_JIT_DEEPGEMM=false export SGLANG_ENABLE_FLASHINFER_GEMM=true +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) # Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls. if [[ $CONC -ge 16 ]]; then @@ -34,4 +35,27 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0. --cuda-graph-max-bs 128 --max-running-requests 128 \ --mem-fraction-static 0.82 --kv-cache-dtype fp8_e4m3 --chunked-prefill-size 32768 --max-prefill-tokens 32768 \ --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL --disable-radix-cache \ ---attention-backend trtllm_mla --stream-interval 30 --ep-size $EP_SIZE --moe-runner-backend flashinfer_trtllm --quantization fp8 +--attention-backend trtllm_mla --stream-interval 30 --ep-size $EP_SIZE --moe-runner-backend flashinfer_trtllm --quantization fp8 > $SERVER_LOG 2>&1 & + +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
+set +x +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID + +pip install -q datasets pandas +set -x +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ +--model $MODEL --backend vllm --base-url http://localhost:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) \ +--max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ --result-filename $RESULT_FILENAME.json \ No newline at end of file diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh index 58d4525f1..81fc4137b 100644 --- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh @@ -69,25 +69,28 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \ --tp_size=$TP --ep_size=$EP_SIZE \ --extra_llm_api_options=$EXTRA_CONFIG_FILE \ > $SERVER_LOG 2>&1 & + +SERVER_PID=$! - +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID -git clone https://github.com/kimbochen/bench_serving.git +pip install -q datasets pandas set -x -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ +--model $MODEL --backend vllm --base-url http://localhost:$PORT \ --dataset-name random \ --random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--num-prompts $(( $CONC * 10 )) \ +--max-concurrency $CONC \ --request-rate inf --ignore-eos \ --save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +--result-dir /workspace/ --result-filename $RESULT_FILENAME.json diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh index 28f3d29cf..f28f525c0 100644 --- a/benchmarks/gptoss_fp4_b200_docker.sh +++ b/benchmarks/gptoss_fp4_b200_docker.sh @@ -50,6 +50,8 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ --gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \ --disable-log-requests > $SERVER_LOG 2>&1 & +SERVER_PID=$! + # Show logs until server is ready tail -f $SERVER_LOG & TAIL_PID=$! 
From bdb3d390b8981a4f145a4e5386dfb1d28101d910 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 17:22:18 -0600 Subject: [PATCH 24/60] reverting erroneous change --- benchmarks/dsr1_fp8_b200_trt_slurm.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh index 81fc4137b..741ecdb92 100644 --- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh @@ -69,7 +69,7 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \ --tp_size=$TP --ep_size=$EP_SIZE \ --extra_llm_api_options=$EXTRA_CONFIG_FILE \ > $SERVER_LOG 2>&1 & - + SERVER_PID=$! # Show logs until server is ready @@ -81,16 +81,16 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID -pip install -q datasets pandas set -x BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend vllm --base-url http://localhost:$PORT \ +--model $MODEL --backend openai \ +--base-url http://0.0.0.0:$PORT \ --dataset-name random \ --random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) \ ---max-concurrency $CONC \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ --request-rate inf --ignore-eos \ --save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json From a2a3db8c4d5a5297eee65e0100810738e0add759 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 18:00:48 -0600 Subject: [PATCH 25/60] fixing b200 --- benchmarks/dsr1_fp4_b200_docker.sh | 2 +- benchmarks/dsr1_fp8_b200_docker.sh | 2 +- benchmarks/gptoss_fp4_b200_docker.sh | 2 +- runners/launch_b200-nvd.sh | 44 
+++++----------------------- 4 files changed, 11 insertions(+), 39 deletions(-) diff --git a/benchmarks/dsr1_fp4_b200_docker.sh b/benchmarks/dsr1_fp4_b200_docker.sh index 6b2112478..8b9f116c6 100644 --- a/benchmarks/dsr1_fp4_b200_docker.sh +++ b/benchmarks/dsr1_fp4_b200_docker.sh @@ -43,7 +43,7 @@ python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend vllm --base-url http://localhost:$PORT \ --dataset-name random \ --random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) \ +--num-prompts $NUM_PROMPTS \ --max-concurrency $CONC \ --request-rate inf --ignore-eos \ --save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ diff --git a/benchmarks/dsr1_fp8_b200_docker.sh b/benchmarks/dsr1_fp8_b200_docker.sh index babb5c9a6..f1412264c 100644 --- a/benchmarks/dsr1_fp8_b200_docker.sh +++ b/benchmarks/dsr1_fp8_b200_docker.sh @@ -54,7 +54,7 @@ python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend vllm --base-url http://localhost:$PORT \ --dataset-name random \ --random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) \ +--num-prompts $NUM_PROMPTS \ --max-concurrency $CONC \ --request-rate inf --ignore-eos \ --save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh index f28f525c0..530e61373 100644 --- a/benchmarks/gptoss_fp4_b200_docker.sh +++ b/benchmarks/gptoss_fp4_b200_docker.sh @@ -69,7 +69,7 @@ python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend vllm --base-url http://localhost:$PORT \ --dataset-name random \ --random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) \ +--num-prompts $NUM_PROMPTS \ --max-concurrency $CONC \ --request-rate inf --ignore-eos \ --save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ diff 
--git a/runners/launch_b200-nvd.sh b/runners/launch_b200-nvd.sh index 21a10d48f..a2587b477 100644 --- a/runners/launch_b200-nvd.sh +++ b/runners/launch_b200-nvd.sh @@ -25,29 +25,6 @@ set -x # Disabling it can reduce perf but will improve CI stability. i.e. we won't see vLLM/Sglang crashes. # Ref: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-graph-register - -docker run --rm -d --init --network host --name $server_name \ ---runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ --v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE \ --e NCCL_GRAPH_REGISTER=0 \ --e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ ---entrypoint=/bin/bash \ -$(echo "$IMAGE" | sed 's/#/\//') \ -benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh" - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ Application\ startup\ complete ]]; then - break - fi -done < <(docker logs -f --tail=0 $server_name 2>&1) - -git clone https://github.com/kimbochen/bench_serving.git - - if [[ "$MODEL" == "nvidia/DeepSeek-R1-0528-FP4" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then if [[ "$OSL" == "8192" ]]; then NUM_PROMPTS=$(( CONC * 20 )) @@ -58,22 +35,17 @@ else NUM_PROMPTS=$(( CONC * 10 )) fi -set -x -docker run --rm --network host --name $client_name \ +docker run --rm --init --network host --name $server_name \ +--runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ +-v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ +-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e 
EP_SIZE \ +-e NCCL_GRAPH_REGISTER=0 \ +-e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ +-e -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e NUM_PROMPTS \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \ --lc "pip install -q datasets pandas && \ -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm --base-url http://localhost:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $NUM_PROMPTS \ ---max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json" +benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh" # Try graceful first docker stop -t 90 "$server_name" || true From f682f0943c0ee8b1b5524bdabba48c4629b5fe0c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 18:02:36 -0600 Subject: [PATCH 26/60] fixing b200 pt 2 --- runners/launch_b200-nvd.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_b200-nvd.sh b/runners/launch_b200-nvd.sh index a2587b477..47c7c979f 100644 --- a/runners/launch_b200-nvd.sh +++ b/runners/launch_b200-nvd.sh @@ -42,7 +42,7 @@ docker run --rm --init --network host --name $server_name \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE \ -e NCCL_GRAPH_REGISTER=0 \ -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --e -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e NUM_PROMPTS \ +-e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e NUM_PROMPTS \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \ 
benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh" From 9eee9fd9611e2149ea1e9f0936524b25810aa3c0 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 18:16:35 -0600 Subject: [PATCH 27/60] updating mi300 --- benchmarks/dsr1_fp8_mi300x_docker.sh | 27 ++++++++++++++++- benchmarks/dsr1_fp8_mi300x_slurm.sh | 18 ++++++------ benchmarks/gptoss_fp4_b200_docker.sh | 3 +- benchmarks/gptoss_fp4_mi300x_docker.sh | 26 ++++++++++++++++- benchmarks/gptoss_fp4_mi300x_slurm.sh | 19 ++++++------ runners/launch_b200-nvd.sh | 6 ++-- runners/launch_mi300x-amd.sh | 40 ++------------------------ 7 files changed, 78 insertions(+), 61 deletions(-) diff --git a/benchmarks/dsr1_fp8_mi300x_docker.sh b/benchmarks/dsr1_fp8_mi300x_docker.sh index fca44bcf1..82cb4fbee 100644 --- a/benchmarks/dsr1_fp8_mi300x_docker.sh +++ b/benchmarks/dsr1_fp8_mi300x_docker.sh @@ -24,6 +24,8 @@ fi export SGLANG_USE_AITER=1 +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) + set -x python3 -m sglang.launch_server \ --model-path=$MODEL --host=0.0.0.0 --port=$PORT --trust-remote-code \ @@ -33,4 +35,27 @@ python3 -m sglang.launch_server \ --chunked-prefill-size=196608 \ --num-continuous-decode-steps=4 \ --max-prefill-tokens=196608 \ ---disable-radix-cache +--disable-radix-cache > $SERVER_LOG 2>&1 & + + +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
+set +x
+until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do
+    sleep 5
+done
+kill $TAIL_PID
+
+set -x
+BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX)
+git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR
+python3 $BENCH_SERVING_DIR/benchmark_serving.py \
+--model=$MODEL --backend=vllm --base-url=http://0.0.0.0:$PORT \
+--dataset-name=random \
+--random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \
+--num-prompts=$(( $CONC * 10 )) \
+--max-concurrency=$CONC \
+--request-rate=inf --ignore-eos \
+--save-result --percentile-metrics="ttft,tpot,itl,e2el" \
+--result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json
diff --git a/benchmarks/dsr1_fp8_mi300x_slurm.sh b/benchmarks/dsr1_fp8_mi300x_slurm.sh
index 90babeaee..31fe1bf55 100644
--- a/benchmarks/dsr1_fp8_mi300x_slurm.sh
+++ b/benchmarks/dsr1_fp8_mi300x_slurm.sh
@@ -47,17 +47,19 @@ python3 -m sglang.launch_server \
 --disable-radix-cache \
 > $SERVER_LOG 2>&1 &
 
+# Show logs until server is ready
+tail -f $SERVER_LOG &
+TAIL_PID=$!
set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"The server is fired up and ready to roll"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model=$MODEL --backend=vllm \ --base-url="http://0.0.0.0:$PORT" \ --dataset-name=random \ diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh index 530e61373..ac9aefefe 100644 --- a/benchmarks/gptoss_fp4_b200_docker.sh +++ b/benchmarks/gptoss_fp4_b200_docker.sh @@ -43,6 +43,7 @@ export TORCH_CUDA_ARCH_LIST="10.0" export VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB='{"2":32,"4":32,"8":8}' export PYTHONNOUSERSITE=1 export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1 + SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) set -x @@ -66,7 +67,7 @@ set -x BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend vllm --base-url http://localhost:$PORT \ +--model $MODEL --backend vllm --base-url http://localhost:$PORT \ --dataset-name random \ --random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ --num-prompts $NUM_PROMPTS \ diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh index 66a8642bd..32efdf0fe 100644 --- a/benchmarks/gptoss_fp4_mi300x_docker.sh +++ b/benchmarks/gptoss_fp4_mi300x_docker.sh @@ -24,6 +24,8 @@ export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 +SERVER_LOG=$(mktemp 
/tmp/server-XXXXXX.log) + set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ @@ -34,4 +36,26 @@ vllm serve $MODEL --port $PORT \ --block-size=64 \ --no-enable-prefix-caching \ --disable-log-requests \ ---async-scheduling +--async-scheduling > $SERVER_LOG 2>&1 & + +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! +set +x +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID + +set -x +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ +--model=$MODEL --backend=vllm --base-url=http://$server_name:$PORT \ +--dataset-name=random \ +--random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ +--num-prompts=$(( $CONC * 10 )) \ +--max-concurrency=$CONC \ +--request-rate=inf --ignore-eos \ +--save-result --percentile-metrics="ttft,tpot,itl,e2el" \ +--result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json diff --git a/benchmarks/gptoss_fp4_mi300x_slurm.sh b/benchmarks/gptoss_fp4_mi300x_slurm.sh index 0ab5a250f..0e4a0b3b2 100644 --- a/benchmarks/gptoss_fp4_mi300x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi300x_slurm.sh @@ -48,17 +48,18 @@ vllm serve $MODEL --port $PORT \ --async-scheduling \ > $SERVER_LOG 2>&1 & +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend vllm \ --base-url http://0.0.0.0:$PORT \ --dataset-name random \ diff --git a/runners/launch_b200-nvd.sh b/runners/launch_b200-nvd.sh index 47c7c979f..c5216b006 100644 --- a/runners/launch_b200-nvd.sh +++ b/runners/launch_b200-nvd.sh @@ -27,12 +27,12 @@ set -x if [[ "$MODEL" == "nvidia/DeepSeek-R1-0528-FP4" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then if [[ "$OSL" == "8192" ]]; then - NUM_PROMPTS=$(( CONC * 20 )) + export NUM_PROMPTS=$(( CONC * 20 )) else - NUM_PROMPTS=$(( CONC * 50 )) + export NUM_PROMPTS=$(( CONC * 50 )) fi else - NUM_PROMPTS=$(( CONC * 10 )) + export NUM_PROMPTS=$(( CONC * 10 )) fi docker run --rm --init --network host --name $server_name \ diff --git a/runners/launch_mi300x-amd.sh b/runners/launch_mi300x-amd.sh index 51e059d4c..1f77a1ede 100644 --- a/runners/launch_mi300x-amd.sh +++ b/runners/launch_mi300x-amd.sh @@ -5,52 +5,16 @@ sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing' HF_HUB_CACHE_MOUNT="/shareddata/hf_hub_cache_$(hostname)/" PORT=8888 -network_name="bmk-net" server_name="bmk-server" -client_name="bmk-client" - -docker network create $network_name set -x -docker run --rm -d --ipc=host --shm-size=16g --network=$network_name --name=$server_name \ +docker run --rm --ipc=host --shm-size=16g --name=$server_name \ --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ --cap-add=SYS_PTRACE --security-opt 
seccomp=unconfined \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ --e ISL -e OSL \ +-e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi300x_docker.sh" - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ Application\ startup\ complete ]]; then - break - fi -done < <(docker logs -f --tail=0 $server_name 2>&1) - -git clone https://github.com/kimbochen/bench_serving.git - -set -x -docker run --rm --network=$network_name --name=$client_name \ --v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ ---entrypoint=python3 \ -$IMAGE \ -bench_serving/benchmark_serving.py \ ---model=$MODEL --backend=vllm --base-url=http://$server_name:$PORT \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) \ ---max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics="ttft,tpot,itl,e2el" \ ---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json - -while [ -n "$(docker ps -aq)" ]; do - docker stop $server_name - docker network rm $network_name - sleep 5 -done From a692940723852a80ae9873628cb6795d485a5397 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 18:22:42 -0600 Subject: [PATCH 28/60] updating mi300 pt 2 --- runners/launch_mi300x-amd.sh | 2 +- runners/launch_mi300x-cr.sh | 40 ++---------------------------------- runners/launch_mi300x-oci.sh | 40 ++---------------------------------- 3 files changed, 5 insertions(+), 77 deletions(-) diff --git a/runners/launch_mi300x-amd.sh b/runners/launch_mi300x-amd.sh index 1f77a1ede..780e5a2f0 100644 --- a/runners/launch_mi300x-amd.sh +++ b/runners/launch_mi300x-amd.sh 
@@ -8,7 +8,7 @@ PORT=8888 server_name="bmk-server" set -x -docker run --rm --ipc=host --shm-size=16g --name=$server_name \ +docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ diff --git a/runners/launch_mi300x-cr.sh b/runners/launch_mi300x-cr.sh index 48be17610..bdcc9e422 100644 --- a/runners/launch_mi300x-cr.sh +++ b/runners/launch_mi300x-cr.sh @@ -5,52 +5,16 @@ sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing' HF_HUB_CACHE_MOUNT="/mnt/vdb/gha_cache/hf_hub_cache/" PORT=8888 -network_name="bmk-net" server_name="bmk-server" -client_name="bmk-client" - -docker network create $network_name set -x -docker run --rm -d --ipc=host --shm-size=16g --network=$network_name --name=$server_name \ +docker run --rm -d --ipc=host --shm-size=16g --network=host --name=$server_name \ --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ --e ISL -e OSL \ +-e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi300x_docker.sh" - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ Application\ startup\ complete ]]; then - break - fi -done < <(docker logs -f --tail=0 $server_name 2>&1) - -git clone https://github.com/kimbochen/bench_serving.git - -set -x -docker run --rm --network=$network_name --name=$client_name \ --v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ ---entrypoint=python3 \ -$IMAGE \ 
-bench_serving/benchmark_serving.py \ ---model=$MODEL --backend=vllm --base-url=http://$server_name:$PORT \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) \ ---max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics="ttft,tpot,itl,e2el" \ ---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json - -while [ -n "$(docker ps -aq)" ]; do - docker stop $server_name - docker network rm $network_name - sleep 5 -done diff --git a/runners/launch_mi300x-oci.sh b/runners/launch_mi300x-oci.sh index 60cf9c238..2018cbc94 100644 --- a/runners/launch_mi300x-oci.sh +++ b/runners/launch_mi300x-oci.sh @@ -3,52 +3,16 @@ HF_HUB_CACHE_MOUNT="$HOME/hf_hub_cache/" PORT=8888 -network_name="bmk-net" server_name="bmk-server" -client_name="bmk-client" - -docker network create $network_name set -x -docker run --rm -d --ipc=host --shm-size=16g --network=$network_name --name=$server_name \ +docker run --rm -d --ipc=host --shm-size=16g --network=host --name=$server_name \ --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ --e ISL -e OSL \ +-e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi300x_docker.sh" - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ Application\ startup\ complete ]]; then - break - fi -done < <(docker logs -f --tail=0 $server_name 2>&1) - -git clone https://github.com/kimbochen/bench_serving.git - -set -x -docker run --rm --network=$network_name --name=$client_name \ --v $GITHUB_WORKSPACE:/workspace/ -w 
/workspace/ \ --e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ ---entrypoint=python3 \ -$IMAGE \ -bench_serving/benchmark_serving.py \ ---model=$MODEL --backend=vllm --base-url=http://$server_name:$PORT \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) \ ---max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics="ttft,tpot,itl,e2el" \ ---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json - -while [ -n "$(docker ps -aq)" ]; do - docker stop $server_name - docker network rm $network_name - sleep 5 -done From 6869b369e1ac3db24fb1f14bee38d122c795eae4 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 18:24:03 -0600 Subject: [PATCH 29/60] updating mi300 pt 3 -- remove detached mode --- runners/launch_mi300x-cr.sh | 2 +- runners/launch_mi300x-oci.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/runners/launch_mi300x-cr.sh b/runners/launch_mi300x-cr.sh index bdcc9e422..8fbdaee63 100644 --- a/runners/launch_mi300x-cr.sh +++ b/runners/launch_mi300x-cr.sh @@ -8,7 +8,7 @@ PORT=8888 server_name="bmk-server" set -x -docker run --rm -d --ipc=host --shm-size=16g --network=host --name=$server_name \ +docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ diff --git a/runners/launch_mi300x-oci.sh b/runners/launch_mi300x-oci.sh index 2018cbc94..33614a03c 100644 --- a/runners/launch_mi300x-oci.sh +++ b/runners/launch_mi300x-oci.sh @@ -6,7 +6,7 @@ PORT=8888 server_name="bmk-server" set -x -docker run --rm -d --ipc=host --shm-size=16g --network=host --name=$server_name \ +docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ --privileged --cap-add=CAP_SYS_ADMIN 
--device=/dev/kfd --device=/dev/dri --device=/dev/mem \ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ From b8b6b465047c81561ceb79796e1ece267551b99c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 18:27:48 -0600 Subject: [PATCH 30/60] cleaning up mi355x --- benchmarks/gptoss_fp4_mi355x_docker.sh | 14 +++++--- runners/launch_mi355x-amd.sh | 47 -------------------------- 2 files changed, 9 insertions(+), 52 deletions(-) diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh index 533f5e212..8209857bd 100644 --- a/benchmarks/gptoss_fp4_mi355x_docker.sh +++ b/benchmarks/gptoss_fp4_mi355x_docker.sh @@ -22,6 +22,8 @@ export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_FUSED_MOE_A16W4=1 +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) + set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ @@ -32,18 +34,20 @@ vllm serve $MODEL --port $PORT \ --block-size=64 \ --no-enable-prefix-caching \ --disable-log-requests \ ---async-scheduling | tee $(mktemp /tmp/server-XXXXXX.log) & +--async-scheduling > $SERVER_LOG 2>&1 & -# Show server logs til' it is up, then stop showing +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
set +x -until curl --output /dev/null --silent --fail http://localhost:$PORT/health; do +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do sleep 5 done -pkill -P $$ tee 2>/dev/null +kill $TAIL_PID +set -x BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -set -x python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model=$MODEL --backend=vllm --base-url="http://localhost:$PORT" \ --dataset-name=random \ diff --git a/runners/launch_mi355x-amd.sh b/runners/launch_mi355x-amd.sh index 009a53108..e77daf5c2 100644 --- a/runners/launch_mi355x-amd.sh +++ b/runners/launch_mi355x-amd.sh @@ -17,11 +17,7 @@ HF_HUB_CACHE_MOUNT="/nfsdata/hf_hub_cache-1/" # Temp solution PORT=8888 -# network_name="bmk-net" server_name="bmk-server" -# client_name="bmk-client" - -# docker network create $network_name set -x docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ @@ -35,50 +31,7 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi355x_docker.sh" -# set +x -# while IFS= read -r line; do -# printf '%s\n' "$line" -# if [[ "$line" =~ Application\ startup\ complete ]]; then -# break -# fi -# done < <(docker logs -f --tail=0 $server_name 2>&1) - -# if [[ "$MODEL" == "amd/DeepSeek-R1-0528-MXFP4-Preview" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then -# if [[ "$OSL" == "8192" ]]; then -# NUM_PROMPTS=$(( CONC * 20 )) -# else -# NUM_PROMPTS=$(( CONC * 50 )) -# fi -# else -# NUM_PROMPTS=$(( CONC * 10 )) -# fi - -# git clone https://github.com/kimbochen/bench_serving.git - -# set -x -# docker run --rm --network=$network_name --name=$client_name \ -# -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -# -e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ -# --entrypoint=python3 \ -# $IMAGE \ -# bench_serving/benchmark_serving.py \ -# --model=$MODEL --backend=vllm 
--base-url="http://$server_name:$PORT" \ -# --dataset-name=random \ -# --random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ -# --num-prompts=$NUM_PROMPTS \ -# --max-concurrency=$CONC \ -# --request-rate=inf --ignore-eos \ -# --save-result --percentile-metrics="ttft,tpot,itl,e2el" \ -# --result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json - if ls gpucore.* 1> /dev/null 2>&1; then echo "gpucore files exist. not good" rm -f gpucore.* fi - - -# while [ -n "$(docker ps -aq)" ]; do -# docker stop $server_name -# # docker network rm $network_name -# sleep 5 -# done From df64d8f753ed7c1d9f3baa9d84bec0dc3e124d0b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 18:33:16 -0600 Subject: [PATCH 31/60] fixing mi300x and updating 325x --- benchmarks/dsr1_fp8_mi325x_docker.sh | 26 +++++++++++++++++++++++++- benchmarks/dsr1_fp8_mi325x_slurm.sh | 18 ++++++++++-------- benchmarks/gptoss_fp4_mi300x_docker.sh | 4 ++-- benchmarks/gptoss_fp4_mi325x_docker.sh | 26 +++++++++++++++++++++++++- benchmarks/gptoss_fp4_mi325x_slurm.sh | 18 ++++++++++-------- 5 files changed, 72 insertions(+), 20 deletions(-) diff --git a/benchmarks/dsr1_fp8_mi325x_docker.sh b/benchmarks/dsr1_fp8_mi325x_docker.sh index f39a8dbbd..41f77ebd3 100644 --- a/benchmarks/dsr1_fp8_mi325x_docker.sh +++ b/benchmarks/dsr1_fp8_mi325x_docker.sh @@ -14,6 +14,8 @@ export SGLANG_USE_AITER=1 +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) + python3 -m sglang.launch_server \ --model-path $MODEL \ --host=0.0.0.0 \ @@ -24,5 +26,27 @@ python3 -m sglang.launch_server \ --mem-fraction-static 0.8 --disable-radix-cache \ --num-continuous-decode-steps 4 \ --max-prefill-tokens 196608 \ - --cuda-graph-max-bs 128 + --cuda-graph-max-bs 128 > $SERVER_LOG 2>&1 & + +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
+set +x +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID + +set -x +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ +--model=$MODEL --backend=vllm --base-url=http://0.0.0.0:$PORT \ +--dataset-name=random \ +--random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ +--num-prompts=$(( $CONC * 10 )) \ +--max-concurrency=$CONC \ +--request-rate=inf --ignore-eos \ +--save-result --percentile-metrics="ttft,tpot,itl,e2el" \ +--result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json diff --git a/benchmarks/dsr1_fp8_mi325x_slurm.sh b/benchmarks/dsr1_fp8_mi325x_slurm.sh index 09dae4dbb..f9da69095 100644 --- a/benchmarks/dsr1_fp8_mi325x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi325x_slurm.sh @@ -23,17 +23,19 @@ python3 -m sglang.launch_server \ --disable-radix-cache \ > $SERVER_LOG 2>&1 & +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"The server is fired up and ready to roll"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend vllm \ --base-url http://0.0.0.0:$PORT \ --dataset-name random \ diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh index 32efdf0fe..0b03900be 100644 --- a/benchmarks/gptoss_fp4_mi300x_docker.sh +++ b/benchmarks/gptoss_fp4_mi300x_docker.sh @@ -51,11 +51,11 @@ set -x BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model=$MODEL --backend=vllm --base-url=http://$server_name:$PORT \ +--model=$MODEL --backend=vllm --base-url=http://0.0.0.0:$PORT \ --dataset-name=random \ --random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ --num-prompts=$(( $CONC * 10 )) \ --max-concurrency=$CONC \ --request-rate=inf --ignore-eos \ --save-result --percentile-metrics="ttft,tpot,itl,e2el" \ ---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json +--result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_mi325x_docker.sh b/benchmarks/gptoss_fp4_mi325x_docker.sh index 05250267f..c57446da3 100644 --- a/benchmarks/gptoss_fp4_mi325x_docker.sh +++ b/benchmarks/gptoss_fp4_mi325x_docker.sh @@ -23,6 +23,8 @@ export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 export 
VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) + set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ @@ -33,4 +35,26 @@ vllm serve $MODEL --port $PORT \ --block-size=64 \ --no-enable-prefix-caching \ --disable-log-requests \ ---async-scheduling +--async-scheduling > $SERVER_LOG 2>&1 & + +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! +set +x +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID + +set -x +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ +--model=$MODEL --backend=vllm --base-url=http://0.0.0.0:$PORT \ +--dataset-name=random \ +--random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ +--num-prompts=$(( $CONC * 10 )) \ +--max-concurrency=$CONC \ +--request-rate=inf --ignore-eos \ +--save-result --percentile-metrics="ttft,tpot,itl,e2el" \ +--result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh index cab549cbc..9cbef3276 100644 --- a/benchmarks/gptoss_fp4_mi325x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh @@ -48,17 +48,19 @@ vllm serve $MODEL --port $PORT \ --async-scheduling \ > $SERVER_LOG 2>&1 & +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend vllm \ --base-url http://0.0.0.0:$PORT \ --dataset-name random \ From def851c32fa32721a6dc98ef85b537b6599ca0a4 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 18:38:43 -0600 Subject: [PATCH 32/60] reverting max conc to 512 on gptoss fp4 b200 docker --- benchmarks/gptoss_fp4_h100_docker.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index ae889474c..2cec8a165 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -51,7 +51,7 @@ python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --base-url=http://localhost:$PORT \ --dataset-name=random \ --random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) --max-concurrency=$CONC \ +--num-prompts=$(( $CONC * 10 )) --max-concurrency=512 \ --request-rate=inf --ignore-eos \ --save-result --percentile-metrics='ttft,tpot,itl,e2el' \ --result-dir=/workspace/ \ From ad97a0bba79c1d85ec7601765f51cf43bfa44f1f Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 18:33:16 -0600 Subject: [PATCH 33/60] fixing mi300x and updating 325x --- runners/launch_mi325x-amd.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh index b622ee2e8..1065167d7 100644 --- 
a/runners/launch_mi325x-amd.sh +++ b/runners/launch_mi325x-amd.sh @@ -21,4 +21,4 @@ srun --jobid=$JOB_ID \ --no-container-entrypoint --export=ALL \ bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_mi325x_slurm.sh -scancel $JOB_ID \ No newline at end of file +scancel $JOB_ID From 284c11f1cc694c15d41e3ce68c69022be8061758 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 14 Nov 2025 09:04:15 -0600 Subject: [PATCH 34/60] cleanng up --- benchmarks/dsr1_fp4_mi355x_docker.sh | 27 ++++++++++++++++++++++++- benchmarks/dsr1_fp4_mi355x_slurm.sh | 13 +++--------- benchmarks/dsr1_fp8_h200_slurm.sh | 13 +++--------- benchmarks/dsr1_fp8_mi355x_docker.sh | 13 ++++++++---- benchmarks/dsr1_fp8_mi355x_slurm.sh | 18 +++++++++-------- benchmarks/gptoss_fp4_b200_trt_slurm.sh | 20 +++++++++--------- benchmarks/gptoss_fp4_h200_slurm.sh | 18 +++++++++-------- 7 files changed, 72 insertions(+), 50 deletions(-) diff --git a/benchmarks/dsr1_fp4_mi355x_docker.sh b/benchmarks/dsr1_fp4_mi355x_docker.sh index 4d3ed084c..72c4e4778 100644 --- a/benchmarks/dsr1_fp4_mi355x_docker.sh +++ b/benchmarks/dsr1_fp4_mi355x_docker.sh @@ -18,6 +18,8 @@ if [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then fi fi +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) + set -x python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \ --host=0.0.0.0 --port=$PORT \ @@ -27,5 +29,28 @@ python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \ --disable-radix-cache \ --num-continuous-decode-steps=4 \ --max-prefill-tokens=$PREFILL_SIZE \ ---cuda-graph-max-bs=128 +--cuda-graph-max-bs=128 > $SERVER_LOG 2>&1 & + +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
+set +x +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID + +set -x +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ +--model=$MODEL --backend=vllm --base-url="http://localhost:$PORT" \ +--dataset-name=random \ +--random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ +--num-prompts=$NUM_PROMPTS \ +--max-concurrency=$CONC \ +--request-rate=inf --ignore-eos \ +--save-result --percentile-metrics="ttft,tpot,itl,e2el" \ +--result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json + diff --git a/benchmarks/dsr1_fp4_mi355x_slurm.sh b/benchmarks/dsr1_fp4_mi355x_slurm.sh index b88a90f46..ffd2883fd 100644 --- a/benchmarks/dsr1_fp4_mi355x_slurm.sh +++ b/benchmarks/dsr1_fp4_mi355x_slurm.sh @@ -34,17 +34,10 @@ python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \ --cuda-graph-max-bs=128 \ > $SERVER_LOG 2>&1 & -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"The server is fired up and ready to roll"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend vllm \ --base-url "http://0.0.0.0:$PORT" \ --dataset-name random \ diff --git a/benchmarks/dsr1_fp8_h200_slurm.sh b/benchmarks/dsr1_fp8_h200_slurm.sh index 86ea0024f..2298b5486 100644 --- a/benchmarks/dsr1_fp8_h200_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_slurm.sh @@ -44,17 +44,10 @@ else > $SERVER_LOG 2>&1 & fi -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - 
fi -done < <(tail -F -n0 "$SERVER_LOG") - set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend vllm \ --base-url http://0.0.0.0:$PORT \ --dataset-name random \ diff --git a/benchmarks/dsr1_fp8_mi355x_docker.sh b/benchmarks/dsr1_fp8_mi355x_docker.sh index baad70fd8..2ee734495 100644 --- a/benchmarks/dsr1_fp8_mi355x_docker.sh +++ b/benchmarks/dsr1_fp8_mi355x_docker.sh @@ -14,6 +14,8 @@ export SGLANG_USE_AITER=1 +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) + python3 -m sglang.launch_server \ --model-path $MODEL \ --host=0.0.0.0 \ @@ -24,13 +26,16 @@ python3 -m sglang.launch_server \ --mem-fraction-static 0.8 --disable-radix-cache \ --num-continuous-decode-steps 4 \ --max-prefill-tokens 196608 \ - --cuda-graph-max-bs 128 | tee $(mktemp /tmp/server-XXXXXX.log) & + --cuda-graph-max-bs 128 > $SERVER_LOG 2>&1 & +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
set +x -until curl --output /dev/null --silent --fail http://localhost:$PORT/health; do +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do sleep 5 done -pkill -P $$ tee 2>/dev/null +kill $TAIL_PID if [[ "$MODEL" == "amd/DeepSeek-R1-0528-MXFP4-Preview" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then if [[ "$OSL" == "8192" ]]; then @@ -42,9 +47,9 @@ else NUM_PROMPTS=$(( CONC * 10 )) fi +set -x BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -set -x python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model=$MODEL --backend=vllm --base-url="http://localhost:$PORT" \ --dataset-name=random \ diff --git a/benchmarks/dsr1_fp8_mi355x_slurm.sh b/benchmarks/dsr1_fp8_mi355x_slurm.sh index bf5d60e9c..0bdc36024 100644 --- a/benchmarks/dsr1_fp8_mi355x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi355x_slurm.sh @@ -32,17 +32,19 @@ python3 -m sglang.launch_server \ --max-prefill-tokens 196608 \ --cuda-graph-max-bs 128 > $SERVER_LOG 2>&1 & +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"The server is fired up and ready to roll"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend vllm \ --base-url "http://0.0.0.0:$PORT" \ --dataset-name random \ diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh index 349930dfb..92477dd56 100644 --- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh @@ -79,17 +79,19 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \ > $SERVER_LOG 2>&1 & +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -git clone https://github.com/kimbochen/bench_serving.git +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID + set -x -python3 bench_serving/benchmark_serving.py \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend openai \ --base-url http://0.0.0.0:$PORT \ --dataset-name random \ diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh index f92c60425..37851d39c 100644 --- a/benchmarks/gptoss_fp4_h200_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_slurm.sh @@ -48,17 +48,19 @@ PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config --gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \ --disable-log-requests > $SERVER_LOG 2>&1 & +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend vllm \ --base-url http://0.0.0.0:$PORT \ --dataset-name random \ From 5ca8d5bb18acee252091335e75ec34268889413e Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 14 Nov 2025 09:09:20 -0600 Subject: [PATCH 35/60] add wait for h200 slurm dsr1 --- benchmarks/dsr1_fp8_h200_slurm.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/benchmarks/dsr1_fp8_h200_slurm.sh b/benchmarks/dsr1_fp8_h200_slurm.sh index 2298b5486..14e3c2a7b 100644 --- a/benchmarks/dsr1_fp8_h200_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_slurm.sh @@ -44,6 +44,15 @@ else > $SERVER_LOG 2>&1 & fi +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
+set +x +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID + set -x BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR From 6d300f4baf800c65cff0b96a7ff28c8828f6d814 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 14 Nov 2025 09:20:27 -0600 Subject: [PATCH 36/60] max num seqs back to 512 for gptoss fpr b200 docker --- benchmarks/gptoss_fp4_b200_docker.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh index ac9aefefe..878564e11 100644 --- a/benchmarks/gptoss_fp4_b200_docker.sh +++ b/benchmarks/gptoss_fp4_b200_docker.sh @@ -48,7 +48,7 @@ SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) set -x vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ ---gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \ +--gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs 512 \ --disable-log-requests > $SERVER_LOG 2>&1 & SERVER_PID=$! 
From b026d28607861675b90d943e35328f624d17cb9e Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 14 Nov 2025 09:40:10 -0600 Subject: [PATCH 37/60] fix port issue for dsr1 mi300x docker --- benchmarks/dsr1_fp8_mi300x_docker.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/dsr1_fp8_mi300x_docker.sh b/benchmarks/dsr1_fp8_mi300x_docker.sh index 82cb4fbee..033b84ed0 100644 --- a/benchmarks/dsr1_fp8_mi300x_docker.sh +++ b/benchmarks/dsr1_fp8_mi300x_docker.sh @@ -51,7 +51,7 @@ set -x BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model=$MODEL --backend=vllm --base-url=http://$server_name:$PORT \ +--model=$MODEL --backend=vllm --base-url=http://0.0.0.0:$PORT \ --dataset-name=random \ --random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ --num-prompts=$(( $CONC * 10 )) \ From 9e38f87d06230b97bbaf797562db4ff86a8259ea Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 14 Nov 2025 11:50:53 -0600 Subject: [PATCH 38/60] fix mi355x docker NUM_PROMPTS --- runners/launch_mi355x-amd.sh | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/runners/launch_mi355x-amd.sh b/runners/launch_mi355x-amd.sh index e77daf5c2..5f3cbb290 100644 --- a/runners/launch_mi355x-amd.sh +++ b/runners/launch_mi355x-amd.sh @@ -19,13 +19,23 @@ PORT=8888 server_name="bmk-server" +if [[ "$MODEL" == "amd/DeepSeek-R1-0528-MXFP4-Preview" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then + if [[ "$OSL" == "8192" ]]; then + export NUM_PROMPTS=$(( CONC * 20 )) + else + export NUM_PROMPTS=$(( CONC * 50 )) + fi +else + export NUM_PROMPTS=$(( CONC * 10 )) +fi + set -x docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ --cap-add=SYS_PTRACE --security-opt 
seccomp=unconfined \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ +-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT -e NUM_PROMPTS \ -e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \ --entrypoint=/bin/bash \ $IMAGE \ From ee2105b7d433540968243360074e35b1ef783d9f Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 14 Nov 2025 15:46:51 -0600 Subject: [PATCH 39/60] adding prop of failure for server logs --- benchmarks/dsr1_fp4_mi355x_slurm.sh | 9 +++++++++ benchmarks/dsr1_fp8_h200_trt_slurm.sh | 2 +- benchmarks/gptoss_fp4_h100_docker.sh | 6 ++++++ runners/launch_b200-nb.sh | 2 ++ 4 files changed, 18 insertions(+), 1 deletion(-) diff --git a/benchmarks/dsr1_fp4_mi355x_slurm.sh b/benchmarks/dsr1_fp4_mi355x_slurm.sh index ffd2883fd..cad5efdc5 100644 --- a/benchmarks/dsr1_fp4_mi355x_slurm.sh +++ b/benchmarks/dsr1_fp4_mi355x_slurm.sh @@ -34,6 +34,15 @@ python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \ --cuda-graph-max-bs=128 \ > $SERVER_LOG 2>&1 & +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! +set +x +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID + set -x BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index a44769bc6..15647bbd2 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -69,9 +69,9 @@ PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \ --tp_size=$TP --ep_size=$EP_SIZE \ --extra_llm_api_options=$EXTRA_CONFIG_FILE \ > $SERVER_LOG 2>&1 & + SERVER_PID=$! - # Show logs until server is ready tail -f $SERVER_LOG & TAIL_PID=$! 
diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 2cec8a165..6b95fae1a 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -32,11 +32,17 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --max-num-seqs=$CONC \ --disable-log-requests > $SERVER_LOG 2>&1 & +SERVER_PID=$! + # Show logs until server is ready tail -f $SERVER_LOG & TAIL_PID=$! set +x until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + if ! kill -0 $SERVER_PID 2>/dev/null; then + echo "Server died before becoming healthy. Exiting." + exit 1 + fi sleep 5 done kill $TAIL_PID diff --git a/runners/launch_b200-nb.sh b/runners/launch_b200-nb.sh index 9a3dfa909..ecd1466dd 100644 --- a/runners/launch_b200-nb.sh +++ b/runners/launch_b200-nb.sh @@ -13,3 +13,5 @@ srun --partition=$PARTITION --gres=gpu:$TP --exclusive \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT_OFFSET=${USER: -1} \ bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_slurm.sh + +scancel $JOB_ID \ No newline at end of file From 9dd3e1a1a5d0abeee8e0122ccc3c4b23b2b13c74 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 14 Nov 2025 16:05:09 -0600 Subject: [PATCH 40/60] add utils function for benchmark --- benchmarks/benchmark_lib.sh | 143 ++++++++++++++++++++++++ benchmarks/dsr1_fp4_b200_docker.sh | 26 +++-- benchmarks/dsr1_fp4_b200_trt_slurm.sh | 26 +++-- benchmarks/dsr1_fp4_mi355x_docker.sh | 25 +++-- benchmarks/dsr1_fp4_mi355x_slurm.sh | 25 +++-- benchmarks/dsr1_fp8_b200_docker.sh | 26 +++-- benchmarks/dsr1_fp8_b200_trt_slurm.sh | 26 +++-- benchmarks/dsr1_fp8_h200_slurm.sh | 26 +++-- benchmarks/dsr1_fp8_h200_trt_slurm.sh | 26 +++-- benchmarks/dsr1_fp8_mi300x_docker.sh | 25 +++-- benchmarks/dsr1_fp8_mi300x_slurm.sh | 26 +++-- benchmarks/dsr1_fp8_mi325x_docker.sh | 25 +++-- benchmarks/dsr1_fp8_mi325x_slurm.sh | 26 +++-- benchmarks/dsr1_fp8_mi355x_docker.sh | 25 +++-- 
benchmarks/dsr1_fp8_mi355x_slurm.sh | 25 +++-- benchmarks/gptoss_fp4_b200_docker.sh | 26 +++-- benchmarks/gptoss_fp4_b200_trt_slurm.sh | 26 +++-- benchmarks/gptoss_fp4_h100_docker.sh | 28 ++--- benchmarks/gptoss_fp4_h100_slurm.sh | 28 ++--- benchmarks/gptoss_fp4_h200_slurm.sh | 26 +++-- benchmarks/gptoss_fp4_h200_trt_slurm.sh | 26 +++-- benchmarks/gptoss_fp4_mi300x_docker.sh | 25 +++-- benchmarks/gptoss_fp4_mi300x_slurm.sh | 26 +++-- benchmarks/gptoss_fp4_mi325x_docker.sh | 25 +++-- benchmarks/gptoss_fp4_mi325x_slurm.sh | 26 +++-- benchmarks/gptoss_fp4_mi355x_docker.sh | 25 +++-- benchmarks/gptoss_fp4_mi355x_slurm.sh | 25 +++-- 27 files changed, 512 insertions(+), 301 deletions(-) create mode 100644 benchmarks/benchmark_lib.sh diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh new file mode 100644 index 000000000..152b5e4b6 --- /dev/null +++ b/benchmarks/benchmark_lib.sh @@ -0,0 +1,143 @@ +#!/usr/bin/env bash + +# Shared benchmarking utilities for InferenceMAX + +# Run benchmark serving with standardized parameters +# All parameters are required +# Parameters: +# --model: Model name +# --port: Server port +# --backend: Backend type - 'vllm' or 'openai' +# --input-len: Random input sequence length +# --output-len: Random output sequence length +# --random-range-ratio: Random range ratio +# --num-prompts: Number of prompts +# --max-concurrency: Max concurrency +# --result-filename: Result filename without extension +# --result-dir: Result directory +run_benchmark_serving() { + local model="" + local port="" + local backend="" + local input_len="" + local output_len="" + local random_range_ratio="" + local num_prompts="" + local max_concurrency="" + local result_filename="" + local result_dir="" + + # Parse arguments + while [[ $# -gt 0 ]]; do + case $1 in + --model) + model="$2" + shift 2 + ;; + --port) + port="$2" + shift 2 + ;; + --backend) + backend="$2" + shift 2 + ;; + --input-len) + input_len="$2" + shift 2 + ;; + --output-len) + 
output_len="$2" + shift 2 + ;; + --random-range-ratio) + random_range_ratio="$2" + shift 2 + ;; + --num-prompts) + num_prompts="$2" + shift 2 + ;; + --max-concurrency) + max_concurrency="$2" + shift 2 + ;; + --result-filename) + result_filename="$2" + shift 2 + ;; + --result-dir) + result_dir="$2" + shift 2 + ;; + *) + echo "Unknown parameter: $1" + return 1 + ;; + esac + done + + # Validate all required parameters + if [[ -z "$model" ]]; then + echo "Error: --model is required" + return 1 + fi + if [[ -z "$port" ]]; then + echo "Error: --port is required" + return 1 + fi + if [[ -z "$backend" ]]; then + echo "Error: --backend is required" + return 1 + fi + if [[ -z "$input_len" ]]; then + echo "Error: --input-len is required" + return 1 + fi + if [[ -z "$output_len" ]]; then + echo "Error: --output-len is required" + return 1 + fi + if [[ -z "$random_range_ratio" ]]; then + echo "Error: --random-range-ratio is required" + return 1 + fi + if [[ -z "$num_prompts" ]]; then + echo "Error: --num-prompts is required" + return 1 + fi + if [[ -z "$max_concurrency" ]]; then + echo "Error: --max-concurrency is required" + return 1 + fi + if [[ -z "$result_filename" ]]; then + echo "Error: --result-filename is required" + return 1 + fi + if [[ -z "$result_dir" ]]; then + echo "Error: --result-dir is required" + return 1 + fi + + # Clone benchmark serving repo + local BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) + git clone https://github.com/kimbochen/bench_serving.git "$BENCH_SERVING_DIR" + + # Run benchmark + python3 "$BENCH_SERVING_DIR/benchmark_serving.py" \ + --model "$model" \ + --backend "$backend" \ + --base-url "http://0.0.0.0:$port" \ + --dataset-name random \ + --random-input-len "$input_len" \ + --random-output-len "$output_len" \ + --random-range-ratio "$random_range_ratio" \ + --num-prompts "$num_prompts" \ + --max-concurrency "$max_concurrency" \ + --request-rate inf \ + --ignore-eos \ + --save-result \ + --percentile-metrics 'ttft,tpot,itl,e2el' \ + 
--result-dir "$result_dir" \ + --result-filename "$result_filename.json" +} diff --git a/benchmarks/dsr1_fp4_b200_docker.sh b/benchmarks/dsr1_fp4_b200_docker.sh index 8b9f116c6..317015ba8 100644 --- a/benchmarks/dsr1_fp4_b200_docker.sh +++ b/benchmarks/dsr1_fp4_b200_docker.sh @@ -36,16 +36,20 @@ done kill $TAIL_PID pip install -q datasets pandas + +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend vllm --base-url http://localhost:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $NUM_PROMPTS \ ---max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$NUM_PROMPTS" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh index 6896880fb..897ef8527 100644 --- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh @@ -110,16 +110,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name 
random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend openai \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp4_mi355x_docker.sh b/benchmarks/dsr1_fp4_mi355x_docker.sh index 72c4e4778..05603ecae 100644 --- a/benchmarks/dsr1_fp4_mi355x_docker.sh +++ b/benchmarks/dsr1_fp4_mi355x_docker.sh @@ -40,17 +40,20 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model=$MODEL --backend=vllm --base-url="http://localhost:$PORT" \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$NUM_PROMPTS \ ---max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics="ttft,tpot,itl,e2el" \ ---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$NUM_PROMPTS" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp4_mi355x_slurm.sh 
b/benchmarks/dsr1_fp4_mi355x_slurm.sh index cad5efdc5..c47bbfb38 100644 --- a/benchmarks/dsr1_fp4_mi355x_slurm.sh +++ b/benchmarks/dsr1_fp4_mi355x_slurm.sh @@ -43,16 +43,19 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url "http://0.0.0.0:$PORT" \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics "ttft,tpot,itl,e2el" \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp8_b200_docker.sh b/benchmarks/dsr1_fp8_b200_docker.sh index f1412264c..fa498ff3e 100644 --- a/benchmarks/dsr1_fp8_b200_docker.sh +++ b/benchmarks/dsr1_fp8_b200_docker.sh @@ -47,15 +47,19 @@ done kill $TAIL_PID pip install -q datasets pandas + +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend vllm --base-url http://localhost:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $NUM_PROMPTS \ 
---max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json \ No newline at end of file +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$NUM_PROMPTS" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ No newline at end of file diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh index 741ecdb92..a22536d82 100644 --- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh @@ -81,16 +81,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend openai \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp8_h200_slurm.sh b/benchmarks/dsr1_fp8_h200_slurm.sh index 14e3c2a7b..7444f763f 100644 --- 
a/benchmarks/dsr1_fp8_h200_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_slurm.sh @@ -53,16 +53,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index 15647bbd2..94baa7850 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -85,16 +85,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency 
$CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend openai \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp8_mi300x_docker.sh b/benchmarks/dsr1_fp8_mi300x_docker.sh index 033b84ed0..5ffebb941 100644 --- a/benchmarks/dsr1_fp8_mi300x_docker.sh +++ b/benchmarks/dsr1_fp8_mi300x_docker.sh @@ -47,15 +47,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model=$MODEL --backend=vllm --base-url=http://0.0.0.0:$PORT \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) \ ---max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics="ttft,tpot,itl,e2el" \ ---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp8_mi300x_slurm.sh b/benchmarks/dsr1_fp8_mi300x_slurm.sh index 31fe1bf55..ba1597982 100644 --- a/benchmarks/dsr1_fp8_mi300x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi300x_slurm.sh @@ 
-56,16 +56,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model=$MODEL --backend=vllm \ ---base-url="http://0.0.0.0:$PORT" \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) --max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics='ttft,tpot,itl,e2el' \ ---result-dir=/workspace/ \ ---result-filename=$RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp8_mi325x_docker.sh b/benchmarks/dsr1_fp8_mi325x_docker.sh index 41f77ebd3..c0f95846d 100644 --- a/benchmarks/dsr1_fp8_mi325x_docker.sh +++ b/benchmarks/dsr1_fp8_mi325x_docker.sh @@ -37,16 +37,19 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model=$MODEL --backend=vllm --base-url=http://0.0.0.0:$PORT \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) \ ---max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result 
--percentile-metrics="ttft,tpot,itl,e2el" \ ---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp8_mi325x_slurm.sh b/benchmarks/dsr1_fp8_mi325x_slurm.sh index f9da69095..1ccec681f 100644 --- a/benchmarks/dsr1_fp8_mi325x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi325x_slurm.sh @@ -32,16 +32,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp8_mi355x_docker.sh b/benchmarks/dsr1_fp8_mi355x_docker.sh index 2ee734495..50d9bb02d 100644 --- a/benchmarks/dsr1_fp8_mi355x_docker.sh +++ b/benchmarks/dsr1_fp8_mi355x_docker.sh @@ -47,18 +47,21 @@ else NUM_PROMPTS=$(( CONC * 10 )) fi +# Source 
benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model=$MODEL --backend=vllm --base-url="http://localhost:$PORT" \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$NUM_PROMPTS \ ---max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics="ttft,tpot,itl,e2el" \ ---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$NUM_PROMPTS" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp8_mi355x_slurm.sh b/benchmarks/dsr1_fp8_mi355x_slurm.sh index 0bdc36024..86b5f9649 100644 --- a/benchmarks/dsr1_fp8_mi355x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi355x_slurm.sh @@ -41,15 +41,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url "http://0.0.0.0:$PORT" \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics "ttft,tpot,itl,e2el" \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + 
--port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh index 878564e11..208ea278d 100644 --- a/benchmarks/gptoss_fp4_b200_docker.sh +++ b/benchmarks/gptoss_fp4_b200_docker.sh @@ -63,15 +63,19 @@ done kill $TAIL_PID pip install -q datasets pandas + +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend vllm --base-url http://localhost:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $NUM_PROMPTS \ ---max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json \ No newline at end of file +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$NUM_PROMPTS" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh index 92477dd56..4647cb346 100644 --- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh @@ -88,16 +88,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x 
-BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend openai \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 6b95fae1a..42bbf6b1a 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -48,17 +48,19 @@ done kill $TAIL_PID pip install -q datasets pandas + +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model=$MODEL \ ---backend=vllm \ ---base-url=http://localhost:$PORT \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) --max-concurrency=512 \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics='ttft,tpot,itl,e2el' \ ---result-dir=/workspace/ \ ---result-filename=$RESULT_FILENAME.json \ No newline at end of file +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + 
--output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency 512 \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh index d82bebf72..5f31f0abf 100644 --- a/benchmarks/gptoss_fp4_h100_slurm.sh +++ b/benchmarks/gptoss_fp4_h100_slurm.sh @@ -45,17 +45,19 @@ done kill $TAIL_PID pip install -q datasets pandas + +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model=$MODEL \ ---backend=vllm \ ---base-url="http://0.0.0.0:$PORT" \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) --max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics='ttft,tpot,itl,e2el' \ ---result-dir=/workspace/ \ ---result-filename=$RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh index 37851d39c..146ab16a5 100644 --- a/benchmarks/gptoss_fp4_h200_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_slurm.sh @@ -57,16 +57,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git 
$BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/gptoss_fp4_h200_trt_slurm.sh b/benchmarks/gptoss_fp4_h200_trt_slurm.sh index 0927a0d61..ffe6e65de 100644 --- a/benchmarks/gptoss_fp4_h200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_trt_slurm.sh @@ -68,16 +68,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend openai \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( 
$CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh index 0b03900be..a3b1dc4f3 100644 --- a/benchmarks/gptoss_fp4_mi300x_docker.sh +++ b/benchmarks/gptoss_fp4_mi300x_docker.sh @@ -47,15 +47,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model=$MODEL --backend=vllm --base-url=http://0.0.0.0:$PORT \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) \ ---max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics="ttft,tpot,itl,e2el" \ ---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json \ No newline at end of file +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_mi300x_slurm.sh b/benchmarks/gptoss_fp4_mi300x_slurm.sh index 0e4a0b3b2..053c79197 100644 --- a/benchmarks/gptoss_fp4_mi300x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi300x_slurm.sh @@ -57,15 +57,17 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model 
$MODEL --backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/gptoss_fp4_mi325x_docker.sh b/benchmarks/gptoss_fp4_mi325x_docker.sh index c57446da3..62d2c5bd0 100644 --- a/benchmarks/gptoss_fp4_mi325x_docker.sh +++ b/benchmarks/gptoss_fp4_mi325x_docker.sh @@ -46,15 +46,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model=$MODEL --backend=vllm --base-url=http://0.0.0.0:$PORT \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) \ ---max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics="ttft,tpot,itl,e2el" \ ---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + 
--max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh index 9cbef3276..c1ac421b6 100644 --- a/benchmarks/gptoss_fp4_mi325x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh @@ -57,16 +57,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh index 8209857bd..b26bf11b5 100644 --- a/benchmarks/gptoss_fp4_mi355x_docker.sh +++ b/benchmarks/gptoss_fp4_mi355x_docker.sh @@ -45,15 +45,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ 
---model=$MODEL --backend=vllm --base-url="http://localhost:$PORT" \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$NUM_PROMPTS \ ---max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics="ttft,tpot,itl,e2el" \ ---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$NUM_PROMPTS" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/gptoss_fp4_mi355x_slurm.sh b/benchmarks/gptoss_fp4_mi355x_slurm.sh index 1fcba771f..d378685db 100644 --- a/benchmarks/gptoss_fp4_mi355x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi355x_slurm.sh @@ -47,15 +47,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url "http://0.0.0.0:$PORT" \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics "ttft,tpot,itl,e2el" \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + 
--result-dir /workspace/ From da39840f8ad049fb5486374b198303a5a5cfd866 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 14 Nov 2025 16:09:36 -0600 Subject: [PATCH 41/60] add utils function for benchmark --- benchmarks/benchmark_lib.sh | 1 + benchmarks/gptoss_fp4_h100_docker.sh | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 152b5e4b6..f253f18f4 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -124,6 +124,7 @@ run_benchmark_serving() { git clone https://github.com/kimbochen/bench_serving.git "$BENCH_SERVING_DIR" # Run benchmark + set -x python3 "$BENCH_SERVING_DIR/benchmark_serving.py" \ --model "$model" \ --backend "$backend" \ diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 42bbf6b1a..f2b17f990 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -52,7 +52,6 @@ pip install -q datasets pandas # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ From ce96ec701f3862e4778eb50b3398ae2b09dd82ba Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 14 Nov 2025 16:30:16 -0600 Subject: [PATCH 42/60] function-ize the waiting for server to start --- benchmarks/benchmark_lib.sh | 68 +++++++++++++++++++++++++ benchmarks/dsr1_fp4_b200_docker.sh | 16 +++--- benchmarks/dsr1_fp4_b200_trt_slurm.sh | 13 ++--- benchmarks/dsr1_fp4_mi355x_docker.sh | 12 ++--- benchmarks/dsr1_fp4_mi355x_slurm.sh | 12 ++--- benchmarks/dsr1_fp8_b200_docker.sh | 16 +++--- benchmarks/dsr1_fp8_b200_trt_slurm.sh | 12 ++--- benchmarks/dsr1_fp8_h200_slurm.sh | 12 ++--- benchmarks/dsr1_fp8_h200_trt_slurm.sh | 16 ++---- benchmarks/dsr1_fp8_mi300x_docker.sh | 3 ++ benchmarks/dsr1_fp8_mi300x_slurm.sh | 12 ++--- benchmarks/dsr1_fp8_mi325x_docker.sh | 12 ++--- benchmarks/dsr1_fp8_mi325x_slurm.sh | 12 ++--- 
benchmarks/dsr1_fp8_mi355x_docker.sh | 16 ++---- benchmarks/dsr1_fp8_mi355x_slurm.sh | 12 ++--- benchmarks/gptoss_fp4_b200_docker.sh | 16 ++---- benchmarks/gptoss_fp4_b200_trt_slurm.sh | 13 ++--- benchmarks/gptoss_fp4_h100_docker.sh | 20 ++------ benchmarks/gptoss_fp4_h100_slurm.sh | 16 +++--- benchmarks/gptoss_fp4_h200_slurm.sh | 12 ++--- benchmarks/gptoss_fp4_h200_trt_slurm.sh | 12 ++--- benchmarks/gptoss_fp4_mi300x_docker.sh | 12 ++--- benchmarks/gptoss_fp4_mi300x_slurm.sh | 12 ++--- benchmarks/gptoss_fp4_mi325x_docker.sh | 12 ++--- benchmarks/gptoss_fp4_mi325x_slurm.sh | 12 ++--- benchmarks/gptoss_fp4_mi355x_docker.sh | 12 ++--- benchmarks/gptoss_fp4_mi355x_slurm.sh | 12 ++--- 27 files changed, 170 insertions(+), 235 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index f253f18f4..133f9095f 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -2,6 +2,74 @@ # Shared benchmarking utilities for InferenceMAX +# Wait for server to be ready by polling the health endpoint +# All parameters are required +# Parameters: +# --port: Server port +# --server-log: Path to server log file +# --server-pid: Server process ID (required) +# --sleep-interval: Sleep interval between health checks (optional, default: 5) +wait_for_server_ready() { + local port="" + local server_log="" + local server_pid="" + local sleep_interval=5 + + # Parse arguments + while [[ $# -gt 0 ]]; do + case $1 in + --port) + port="$2" + shift 2 + ;; + --server-log) + server_log="$2" + shift 2 + ;; + --server-pid) + server_pid="$2" + shift 2 + ;; + --sleep-interval) + sleep_interval="$2" + shift 2 + ;; + *) + echo "Unknown parameter: $1" + return 1 + ;; + esac + done + + # Validate required parameters + if [[ -z "$port" ]]; then + echo "Error: --port is required" + return 1 + fi + if [[ -z "$server_log" ]]; then + echo "Error: --server-log is required" + return 1 + fi + if [[ -z "$server_pid" ]]; then + echo "Error: --server-pid is required" + 
return 1 + fi + + # Show logs until server is ready + tail -f "$server_log" & + local TAIL_PID=$! + set +x + until curl --output /dev/null --silent --fail http://0.0.0.0:$port/health; do + if ! kill -0 "$server_pid" 2>/dev/null; then + echo "Server died before becoming healthy. Exiting." + kill $TAIL_PID + exit 1 + fi + sleep "$sleep_interval" + done + kill $TAIL_PID +} + # Run benchmark serving with standardized parameters # All parameters are required # Parameters: diff --git a/benchmarks/dsr1_fp4_b200_docker.sh b/benchmarks/dsr1_fp4_b200_docker.sh index 317015ba8..5f4ab3c5c 100644 --- a/benchmarks/dsr1_fp4_b200_docker.sh +++ b/benchmarks/dsr1_fp4_b200_docker.sh @@ -26,20 +26,16 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0. --ep-size $EP_SIZE --quantization modelopt_fp4 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ --enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10 > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID - -pip install -q datasets pandas +SERVER_PID=$! 
# Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh index 897ef8527..a9f7cc9d4 100644 --- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh @@ -100,19 +100,14 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \ --extra_llm_api_options=$EXTRA_CONFIG_FILE \ > $SERVER_LOG 2>&1 & - -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID +SERVER_PID=$! # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/dsr1_fp4_mi355x_docker.sh b/benchmarks/dsr1_fp4_mi355x_docker.sh index 05603ecae..eed9d1273 100644 --- a/benchmarks/dsr1_fp4_mi355x_docker.sh +++ b/benchmarks/dsr1_fp4_mi355x_docker.sh @@ -31,18 +31,14 @@ python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \ --max-prefill-tokens=$PREFILL_SIZE \ --cuda-graph-max-bs=128 > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID +SERVER_PID=$! 
# Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/dsr1_fp4_mi355x_slurm.sh b/benchmarks/dsr1_fp4_mi355x_slurm.sh index c47bbfb38..afb7ca29c 100644 --- a/benchmarks/dsr1_fp4_mi355x_slurm.sh +++ b/benchmarks/dsr1_fp4_mi355x_slurm.sh @@ -34,18 +34,12 @@ python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \ --cuda-graph-max-bs=128 \ > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID - # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/dsr1_fp8_b200_docker.sh b/benchmarks/dsr1_fp8_b200_docker.sh index fa498ff3e..9a219339c 100644 --- a/benchmarks/dsr1_fp8_b200_docker.sh +++ b/benchmarks/dsr1_fp8_b200_docker.sh @@ -37,20 +37,16 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0. --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL --disable-radix-cache \ --attention-backend trtllm_mla --stream-interval 30 --ep-size $EP_SIZE --moe-runner-backend flashinfer_trtllm --quantization fp8 > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID - -pip install -q datasets pandas +SERVER_PID=$! 
# Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh index a22536d82..a78fece38 100644 --- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh @@ -72,18 +72,12 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \ SERVER_PID=$! -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID - # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/dsr1_fp8_h200_slurm.sh b/benchmarks/dsr1_fp8_h200_slurm.sh index 7444f763f..6eeec6df1 100644 --- a/benchmarks/dsr1_fp8_h200_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_slurm.sh @@ -44,18 +44,14 @@ else > $SERVER_LOG 2>&1 & fi -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID +SERVER_PID=$! 
# Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index 94baa7850..74b2ce8df 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -72,22 +72,12 @@ PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \ SERVER_PID=$! -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - if ! kill -0 $SERVER_PID 2>/dev/null; then - echo "Server died before becoming healthy. Exiting." - exit 1 - fi - sleep 5 -done -kill $TAIL_PID - # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/dsr1_fp8_mi300x_docker.sh b/benchmarks/dsr1_fp8_mi300x_docker.sh index 5ffebb941..db27f4e74 100644 --- a/benchmarks/dsr1_fp8_mi300x_docker.sh +++ b/benchmarks/dsr1_fp8_mi300x_docker.sh @@ -50,6 +50,9 @@ kill $TAIL_PID # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/dsr1_fp8_mi300x_slurm.sh b/benchmarks/dsr1_fp8_mi300x_slurm.sh index ba1597982..0d191299c 100644 --- a/benchmarks/dsr1_fp8_mi300x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi300x_slurm.sh @@ -47,18 +47,14 @@ python3 -m sglang.launch_server \ --disable-radix-cache \ > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! 
-set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID +SERVER_PID=$! # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/dsr1_fp8_mi325x_docker.sh b/benchmarks/dsr1_fp8_mi325x_docker.sh index c0f95846d..9f34e7563 100644 --- a/benchmarks/dsr1_fp8_mi325x_docker.sh +++ b/benchmarks/dsr1_fp8_mi325x_docker.sh @@ -28,18 +28,14 @@ python3 -m sglang.launch_server \ --max-prefill-tokens 196608 \ --cuda-graph-max-bs 128 > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID +SERVER_PID=$! # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/dsr1_fp8_mi325x_slurm.sh b/benchmarks/dsr1_fp8_mi325x_slurm.sh index 1ccec681f..15e9cce64 100644 --- a/benchmarks/dsr1_fp8_mi325x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi325x_slurm.sh @@ -23,18 +23,12 @@ python3 -m sglang.launch_server \ --disable-radix-cache \ > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! 
-set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID - # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/dsr1_fp8_mi355x_docker.sh b/benchmarks/dsr1_fp8_mi355x_docker.sh index 50d9bb02d..a86d7adbe 100644 --- a/benchmarks/dsr1_fp8_mi355x_docker.sh +++ b/benchmarks/dsr1_fp8_mi355x_docker.sh @@ -28,14 +28,11 @@ python3 -m sglang.launch_server \ --max-prefill-tokens 196608 \ --cuda-graph-max-bs 128 > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" if [[ "$MODEL" == "amd/DeepSeek-R1-0528-MXFP4-Preview" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then if [[ "$OSL" == "8192" ]]; then @@ -47,9 +44,6 @@ else NUM_PROMPTS=$(( CONC * 10 )) fi -# Source benchmark utilities -source "$(dirname "$0")/benchmark_lib.sh" - set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/dsr1_fp8_mi355x_slurm.sh b/benchmarks/dsr1_fp8_mi355x_slurm.sh index 86b5f9649..54ba29fc0 100644 --- a/benchmarks/dsr1_fp8_mi355x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi355x_slurm.sh @@ -32,18 +32,12 @@ python3 -m sglang.launch_server \ --max-prefill-tokens 196608 \ --cuda-graph-max-bs 128 > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! 
-set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID - # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh index 208ea278d..60c1a1582 100644 --- a/benchmarks/gptoss_fp4_b200_docker.sh +++ b/benchmarks/gptoss_fp4_b200_docker.sh @@ -53,20 +53,14 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ SERVER_PID=$! -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID - -pip install -q datasets pandas - # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh index 4647cb346..0ec2f325f 100644 --- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh @@ -78,19 +78,14 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \ --extra_llm_api_options=$EXTRA_CONFIG_FILE \ > $SERVER_LOG 2>&1 & - -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID +SERVER_PID=$! 
# Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index f2b17f990..9cf7c5275 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -34,24 +34,14 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ SERVER_PID=$! -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - if ! kill -0 $SERVER_PID 2>/dev/null; then - echo "Server died before becoming healthy. Exiting." - exit 1 - fi - sleep 5 -done -kill $TAIL_PID - -pip install -q datasets pandas - # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh index 5f31f0abf..c3d598116 100644 --- a/benchmarks/gptoss_fp4_h100_slurm.sh +++ b/benchmarks/gptoss_fp4_h100_slurm.sh @@ -35,20 +35,16 @@ PYTHONNOUSERSITE=1 vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --max-num-seqs=$CONC \ --disable-log-requests > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID - -pip install -q datasets pandas +SERVER_PID=$! 
# Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh index 146ab16a5..a3e47ca44 100644 --- a/benchmarks/gptoss_fp4_h200_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_slurm.sh @@ -48,18 +48,14 @@ PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config --gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \ --disable-log-requests > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID +SERVER_PID=$! # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/gptoss_fp4_h200_trt_slurm.sh b/benchmarks/gptoss_fp4_h200_trt_slurm.sh index ffe6e65de..81f1f67de 100644 --- a/benchmarks/gptoss_fp4_h200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_trt_slurm.sh @@ -59,18 +59,14 @@ trtllm-serve $MODEL \ --pp_size=1 \ > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID +SERVER_PID=$! 
# Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh index a3b1dc4f3..003ebf90e 100644 --- a/benchmarks/gptoss_fp4_mi300x_docker.sh +++ b/benchmarks/gptoss_fp4_mi300x_docker.sh @@ -38,18 +38,12 @@ vllm serve $MODEL --port $PORT \ --disable-log-requests \ --async-scheduling > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID - # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/gptoss_fp4_mi300x_slurm.sh b/benchmarks/gptoss_fp4_mi300x_slurm.sh index 053c79197..a9e164cc2 100644 --- a/benchmarks/gptoss_fp4_mi300x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi300x_slurm.sh @@ -48,18 +48,14 @@ vllm serve $MODEL --port $PORT \ --async-scheduling \ > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID +SERVER_PID=$! 
# Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/gptoss_fp4_mi325x_docker.sh b/benchmarks/gptoss_fp4_mi325x_docker.sh index 62d2c5bd0..a000b462f 100644 --- a/benchmarks/gptoss_fp4_mi325x_docker.sh +++ b/benchmarks/gptoss_fp4_mi325x_docker.sh @@ -37,18 +37,12 @@ vllm serve $MODEL --port $PORT \ --disable-log-requests \ --async-scheduling > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID - # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh index c1ac421b6..a9dbff484 100644 --- a/benchmarks/gptoss_fp4_mi325x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh @@ -48,18 +48,12 @@ vllm serve $MODEL --port $PORT \ --async-scheduling \ > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! 
-set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID - # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh index b26bf11b5..e7399694f 100644 --- a/benchmarks/gptoss_fp4_mi355x_docker.sh +++ b/benchmarks/gptoss_fp4_mi355x_docker.sh @@ -36,18 +36,12 @@ vllm serve $MODEL --port $PORT \ --disable-log-requests \ --async-scheduling > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID - # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/gptoss_fp4_mi355x_slurm.sh b/benchmarks/gptoss_fp4_mi355x_slurm.sh index d378685db..2f4d84927 100644 --- a/benchmarks/gptoss_fp4_mi355x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi355x_slurm.sh @@ -38,18 +38,12 @@ vllm serve $MODEL --port $PORT \ --disable-log-requests \ --async-scheduling > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! 
-set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID - # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ From b02b77e4329b7eede39f248a4417a7df2bf7541f Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 14 Nov 2025 16:31:55 -0600 Subject: [PATCH 43/60] dont show arg parsing set -x --- benchmarks/benchmark_lib.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 133f9095f..85d221a92 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -10,6 +10,7 @@ # --server-pid: Server process ID (required) # --sleep-interval: Sleep interval between health checks (optional, default: 5) wait_for_server_ready() { + set -x local port="" local server_log="" local server_pid="" From 01e856194fd4a51d04cf828cc6d331a81e06f068 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 14 Nov 2025 16:36:10 -0600 Subject: [PATCH 44/60] dont show arg parsing set +x oops --- benchmarks/benchmark_lib.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 85d221a92..8e52b949d 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -10,7 +10,7 @@ # --server-pid: Server process ID (required) # --sleep-interval: Sleep interval between health checks (optional, default: 5) wait_for_server_ready() { - set -x + set +x local port="" local server_log="" local server_pid="" From c2c6c3c6693b919685423320d237efca8d6f7cb9 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 14 Nov 2025 16:44:58 -0600 Subject: [PATCH 45/60] dont show arg parsing set +x oops --- benchmarks/benchmark_lib.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 8e52b949d..cc3448d40 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -59,7 +59,6 @@ wait_for_server_ready() { # Show logs until server is ready tail -f "$server_log" & local TAIL_PID=$! - set +x until curl --output /dev/null --silent --fail http://0.0.0.0:$port/health; do if ! kill -0 "$server_pid" 2>/dev/null; then echo "Server died before becoming healthy. Exiting." @@ -85,6 +84,7 @@ wait_for_server_ready() { # --result-filename: Result filename without extension # --result-dir: Result directory run_benchmark_serving() { + set +x local model="" local port="" local backend="" @@ -210,4 +210,5 @@ run_benchmark_serving() { --percentile-metrics 'ttft,tpot,itl,e2el' \ --result-dir "$result_dir" \ --result-filename "$result_filename.json" + set +x } From a56be97ef993771abbfc3c351bd1454582f94b63 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 14 Nov 2025 17:04:18 -0600 Subject: [PATCH 46/60] capture server pid --- benchmarks/dsr1_fp4_b200_docker.sh | 1 - benchmarks/dsr1_fp4_b200_trt_slurm.sh | 1 - benchmarks/dsr1_fp4_mi355x_docker.sh | 1 - benchmarks/dsr1_fp4_mi355x_slurm.sh | 3 ++- benchmarks/dsr1_fp8_b200_docker.sh | 1 - benchmarks/dsr1_fp8_b200_trt_slurm.sh | 1 - benchmarks/dsr1_fp8_h200_slurm.sh | 1 - benchmarks/dsr1_fp8_h200_trt_slurm.sh | 1 - benchmarks/dsr1_fp8_mi300x_docker.sh | 11 +---------- benchmarks/dsr1_fp8_mi300x_slurm.sh | 1 - benchmarks/dsr1_fp8_mi325x_docker.sh | 1 - benchmarks/dsr1_fp8_mi325x_slurm.sh | 3 ++- benchmarks/dsr1_fp8_mi355x_docker.sh | 3 ++- benchmarks/dsr1_fp8_mi355x_slurm.sh | 3 ++- benchmarks/gptoss_fp4_b200_docker.sh | 1 - benchmarks/gptoss_fp4_h100_slurm.sh | 1 - benchmarks/gptoss_fp4_h200_slurm.sh | 1 - benchmarks/gptoss_fp4_h200_trt_slurm.sh | 1 - benchmarks/gptoss_fp4_mi300x_docker.sh | 3 ++- benchmarks/gptoss_fp4_mi325x_docker.sh | 3 ++- benchmarks/gptoss_fp4_mi325x_slurm.sh | 3 ++- benchmarks/gptoss_fp4_mi355x_docker.sh | 3 ++- 
benchmarks/gptoss_fp4_mi355x_slurm.sh | 3 ++- 23 files changed, 19 insertions(+), 32 deletions(-) diff --git a/benchmarks/dsr1_fp4_b200_docker.sh b/benchmarks/dsr1_fp4_b200_docker.sh index 5f4ab3c5c..a520871fa 100644 --- a/benchmarks/dsr1_fp4_b200_docker.sh +++ b/benchmarks/dsr1_fp4_b200_docker.sh @@ -36,7 +36,6 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh index a9f7cc9d4..b4227e428 100644 --- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh @@ -108,7 +108,6 @@ source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/dsr1_fp4_mi355x_docker.sh b/benchmarks/dsr1_fp4_mi355x_docker.sh index eed9d1273..f19b6df2e 100644 --- a/benchmarks/dsr1_fp4_mi355x_docker.sh +++ b/benchmarks/dsr1_fp4_mi355x_docker.sh @@ -39,7 +39,6 @@ source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/dsr1_fp4_mi355x_slurm.sh b/benchmarks/dsr1_fp4_mi355x_slurm.sh index afb7ca29c..f4d7f1d39 100644 --- a/benchmarks/dsr1_fp4_mi355x_slurm.sh +++ b/benchmarks/dsr1_fp4_mi355x_slurm.sh @@ -34,13 +34,14 @@ python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \ --cuda-graph-max-bs=128 \ > $SERVER_LOG 2>&1 & +SERVER_PID=$! 
+ # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/dsr1_fp8_b200_docker.sh b/benchmarks/dsr1_fp8_b200_docker.sh index 9a219339c..ffa7644bd 100644 --- a/benchmarks/dsr1_fp8_b200_docker.sh +++ b/benchmarks/dsr1_fp8_b200_docker.sh @@ -47,7 +47,6 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh index a78fece38..a9a1a04ff 100644 --- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh @@ -78,7 +78,6 @@ source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/dsr1_fp8_h200_slurm.sh b/benchmarks/dsr1_fp8_h200_slurm.sh index 6eeec6df1..06345ecb2 100644 --- a/benchmarks/dsr1_fp8_h200_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_slurm.sh @@ -52,7 +52,6 @@ source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index 74b2ce8df..4ece6f7bc 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -78,7 +78,6 @@ source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ 
--model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/dsr1_fp8_mi300x_docker.sh b/benchmarks/dsr1_fp8_mi300x_docker.sh index db27f4e74..8c269dd83 100644 --- a/benchmarks/dsr1_fp8_mi300x_docker.sh +++ b/benchmarks/dsr1_fp8_mi300x_docker.sh @@ -37,15 +37,7 @@ python3 -m sglang.launch_server \ --max-prefill-tokens=196608 \ --disable-radix-cache > $SERVER_LOG 2>&1 & - -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID +SERVER_PID=$! # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" @@ -53,7 +45,6 @@ source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/dsr1_fp8_mi300x_slurm.sh b/benchmarks/dsr1_fp8_mi300x_slurm.sh index 0d191299c..5fad7a587 100644 --- a/benchmarks/dsr1_fp8_mi300x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi300x_slurm.sh @@ -55,7 +55,6 @@ source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/dsr1_fp8_mi325x_docker.sh b/benchmarks/dsr1_fp8_mi325x_docker.sh index 9f34e7563..565b8fb45 100644 --- a/benchmarks/dsr1_fp8_mi325x_docker.sh +++ b/benchmarks/dsr1_fp8_mi325x_docker.sh @@ -36,7 +36,6 @@ source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/dsr1_fp8_mi325x_slurm.sh b/benchmarks/dsr1_fp8_mi325x_slurm.sh index 15e9cce64..67e4cc394 100644 --- a/benchmarks/dsr1_fp8_mi325x_slurm.sh +++ 
b/benchmarks/dsr1_fp8_mi325x_slurm.sh @@ -23,13 +23,14 @@ python3 -m sglang.launch_server \ --disable-radix-cache \ > $SERVER_LOG 2>&1 & +SERVER_PID=$! + # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/dsr1_fp8_mi355x_docker.sh b/benchmarks/dsr1_fp8_mi355x_docker.sh index a86d7adbe..d4f1dd013 100644 --- a/benchmarks/dsr1_fp8_mi355x_docker.sh +++ b/benchmarks/dsr1_fp8_mi355x_docker.sh @@ -28,6 +28,8 @@ python3 -m sglang.launch_server \ --max-prefill-tokens 196608 \ --cuda-graph-max-bs 128 > $SERVER_LOG 2>&1 & +SERVER_PID=$! + # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" @@ -44,7 +46,6 @@ else NUM_PROMPTS=$(( CONC * 10 )) fi -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/dsr1_fp8_mi355x_slurm.sh b/benchmarks/dsr1_fp8_mi355x_slurm.sh index 54ba29fc0..fd6fe49fb 100644 --- a/benchmarks/dsr1_fp8_mi355x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi355x_slurm.sh @@ -32,13 +32,14 @@ python3 -m sglang.launch_server \ --max-prefill-tokens 196608 \ --cuda-graph-max-bs 128 > $SERVER_LOG 2>&1 & +SERVER_PID=$! 
+ # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh index 60c1a1582..1736701c4 100644 --- a/benchmarks/gptoss_fp4_b200_docker.sh +++ b/benchmarks/gptoss_fp4_b200_docker.sh @@ -61,7 +61,6 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh index c3d598116..843219b95 100644 --- a/benchmarks/gptoss_fp4_h100_slurm.sh +++ b/benchmarks/gptoss_fp4_h100_slurm.sh @@ -45,7 +45,6 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh index a3e47ca44..dc29baf8d 100644 --- a/benchmarks/gptoss_fp4_h200_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_slurm.sh @@ -56,7 +56,6 @@ source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/gptoss_fp4_h200_trt_slurm.sh b/benchmarks/gptoss_fp4_h200_trt_slurm.sh index 81f1f67de..21d6ae02c 100644 --- a/benchmarks/gptoss_fp4_h200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_trt_slurm.sh @@ -67,7 +67,6 @@ source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ 
diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh index 003ebf90e..7d1f98226 100644 --- a/benchmarks/gptoss_fp4_mi300x_docker.sh +++ b/benchmarks/gptoss_fp4_mi300x_docker.sh @@ -38,13 +38,14 @@ vllm serve $MODEL --port $PORT \ --disable-log-requests \ --async-scheduling > $SERVER_LOG 2>&1 & +SERVER_PID=$! + # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/gptoss_fp4_mi325x_docker.sh b/benchmarks/gptoss_fp4_mi325x_docker.sh index a000b462f..46462ad6d 100644 --- a/benchmarks/gptoss_fp4_mi325x_docker.sh +++ b/benchmarks/gptoss_fp4_mi325x_docker.sh @@ -37,13 +37,14 @@ vllm serve $MODEL --port $PORT \ --disable-log-requests \ --async-scheduling > $SERVER_LOG 2>&1 & +SERVER_PID=$! + # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh index a9dbff484..f15e6261c 100644 --- a/benchmarks/gptoss_fp4_mi325x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh @@ -48,13 +48,14 @@ vllm serve $MODEL --port $PORT \ --async-scheduling \ > $SERVER_LOG 2>&1 & +SERVER_PID=$! 
+ # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh index e7399694f..0e54245d4 100644 --- a/benchmarks/gptoss_fp4_mi355x_docker.sh +++ b/benchmarks/gptoss_fp4_mi355x_docker.sh @@ -36,13 +36,14 @@ vllm serve $MODEL --port $PORT \ --disable-log-requests \ --async-scheduling > $SERVER_LOG 2>&1 & +SERVER_PID=$! + # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/gptoss_fp4_mi355x_slurm.sh b/benchmarks/gptoss_fp4_mi355x_slurm.sh index 2f4d84927..a2adf2952 100644 --- a/benchmarks/gptoss_fp4_mi355x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi355x_slurm.sh @@ -38,13 +38,14 @@ vllm serve $MODEL --port $PORT \ --disable-log-requests \ --async-scheduling > $SERVER_LOG 2>&1 & +SERVER_PID=$! 
+ # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ From 32a0d23e8654d1efd2e74380bb2bcc9d9782eebf Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Sun, 16 Nov 2025 18:46:26 -0600 Subject: [PATCH 47/60] nebdius dont scancel --- runners/launch_b200-nb.sh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/runners/launch_b200-nb.sh b/runners/launch_b200-nb.sh index ecd1466dd..44392e3aa 100644 --- a/runners/launch_b200-nb.sh +++ b/runners/launch_b200-nb.sh @@ -12,6 +12,4 @@ srun --partition=$PARTITION --gres=gpu:$TP --exclusive \ --no-container-mount-home --container-writable \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT_OFFSET=${USER: -1} \ -bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_slurm.sh - -scancel $JOB_ID \ No newline at end of file +bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_slurm.sh \ No newline at end of file From 2a085dc5a868229a462336d06a7d68b1b2139642 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 17 Nov 2025 10:12:55 -0600 Subject: [PATCH 48/60] changes to comments in benchmark lib . 
sh --- benchmarks/benchmark_lib.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index cc3448d40..0458cfb78 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -75,7 +75,7 @@ wait_for_server_ready() { # Parameters: # --model: Model name # --port: Server port -# --backend: Backend type - 'vllm' or 'openai' +# --backend: Backend type - e.g., 'vllm' or 'openai' # --input-len: Random input sequence length # --output-len: Random output sequence length # --random-range-ratio: Random range ratio From 18cf708048327e4ed4770c4551d296e31fb19ce1 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Mon, 17 Nov 2025 13:17:43 -0600 Subject: [PATCH 49/60] Update benchmarks/dsr1_fp4_mi355x_docker.sh Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- benchmarks/dsr1_fp4_mi355x_docker.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/benchmarks/dsr1_fp4_mi355x_docker.sh b/benchmarks/dsr1_fp4_mi355x_docker.sh index f19b6df2e..c20a9f63f 100644 --- a/benchmarks/dsr1_fp4_mi355x_docker.sh +++ b/benchmarks/dsr1_fp4_mi355x_docker.sh @@ -50,5 +50,3 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ - - From 380919422586588b48889e376a6a56e6c0aff45f Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Mon, 17 Nov 2025 13:17:53 -0600 Subject: [PATCH 50/60] Update .github/workflows/benchmark-tmpl.yml Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .github/workflows/benchmark-tmpl.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 4496ac001..c95359af5 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -143,7 +143,6 @@ jobs: echo "Waiting for result file... 
(attempt $i)" sleep 1 done - if [ -z "$FOUND_RESULT_FILE" ]; then echo "Run failed: Benchmark result $RESULT_FILENAME.json not found." >&2 exit 1 From 0c752382623d8dd314b720a325f714b97879d4a5 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 17 Nov 2025 14:00:30 -0600 Subject: [PATCH 51/60] adding back whitespace --- .github/workflows/benchmark-tmpl.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index c95359af5..3cae450cd 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -143,6 +143,7 @@ jobs: echo "Waiting for result file... (attempt $i)" sleep 1 done + if [ -z "$FOUND_RESULT_FILE" ]; then echo "Run failed: Benchmark result $RESULT_FILENAME.json not found." >&2 exit 1 From 6dad972f92fec3e063876d1364c3461a71947a2d Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 17 Nov 2025 14:00:43 -0600 Subject: [PATCH 52/60] adding back whitespace --- .github/workflows/benchmark-tmpl.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 3cae450cd..2d0619939 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -143,7 +143,7 @@ jobs: echo "Waiting for result file... (attempt $i)" sleep 1 done - + if [ -z "$FOUND_RESULT_FILE" ]; then echo "Run failed: Benchmark result $RESULT_FILENAME.json not found." 
>&2 exit 1 From 80b2cbb353256df17e1b9d4baaafb02f1b43bd21 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 17 Nov 2025 14:01:08 -0600 Subject: [PATCH 53/60] adding back whitespace --- .github/workflows/benchmark-tmpl.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 2d0619939..4496ac001 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -143,7 +143,7 @@ jobs: echo "Waiting for result file... (attempt $i)" sleep 1 done - + if [ -z "$FOUND_RESULT_FILE" ]; then echo "Run failed: Benchmark result $RESULT_FILENAME.json not found." >&2 exit 1 From 227db82d395f10147a8a3311d62cf7ee8632bd94 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 17 Nov 2025 14:10:58 -0600 Subject: [PATCH 54/60] remove tg launch script --- runners/launch_b200-tg.sh | 19 ------------------- 1 file changed, 19 deletions(-) delete mode 100644 runners/launch_b200-tg.sh diff --git a/runners/launch_b200-tg.sh b/runners/launch_b200-tg.sh deleted file mode 100644 index 97e975a64..000000000 --- a/runners/launch_b200-tg.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/usr/bin/bash - -HF_HUB_CACHE_MOUNT="/dev/shm/hf_hub_cache/" -FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') -PORT=8888 - -server_name="bmk-server" - -set -x -docker run --rm -d --network host --name $server_name \ ---runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ --v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE \ --e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ ---entrypoint=/bin/bash \ -$(echo "$IMAGE" | sed 's/#/\//') \ 
-benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh" From 97edfeea3c591fd22c1ac0b39fa8aacbe5e7b70f Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 21 Nov 2025 15:02:31 -0600 Subject: [PATCH 55/60] Update benchmarks/gptoss_fp4_h100_docker.sh Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- benchmarks/gptoss_fp4_h100_docker.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 9cf7c5275..f38ea3293 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -10,6 +10,7 @@ # CONC # ISL # OSL +# RESULT_FILENAME cat > config.yaml << EOF From b6a80fbcddce3c60e18ae45b9b5b76e6bff7ff35 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 21 Nov 2025 15:03:21 -0600 Subject: [PATCH 56/60] Update benchmarks/dsr1_fp8_mi325x_docker.sh Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- benchmarks/dsr1_fp8_mi325x_docker.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarks/dsr1_fp8_mi325x_docker.sh b/benchmarks/dsr1_fp8_mi325x_docker.sh index 565b8fb45..72c89571d 100644 --- a/benchmarks/dsr1_fp8_mi325x_docker.sh +++ b/benchmarks/dsr1_fp8_mi325x_docker.sh @@ -47,4 +47,3 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ - From fa1e2c0ec5094705b9a973f3f890e46d6594d083 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 21 Nov 2025 15:03:52 -0600 Subject: [PATCH 57/60] Update benchmarks/dsr1_fp8_mi355x_docker.sh Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- benchmarks/dsr1_fp8_mi355x_docker.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/benchmarks/dsr1_fp8_mi355x_docker.sh b/benchmarks/dsr1_fp8_mi355x_docker.sh index d4f1dd013..128810b42 100644 --- a/benchmarks/dsr1_fp8_mi355x_docker.sh +++ b/benchmarks/dsr1_fp8_mi355x_docker.sh @@ -58,5 +58,3 @@ run_benchmark_serving \ 
--result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ - - From a5ebc4a8d6adb08bda821aad17de44468fef3b9d Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 21 Nov 2025 15:04:47 -0600 Subject: [PATCH 58/60] Update benchmarks/gptoss_fp4_b200_trt_slurm.sh Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- benchmarks/gptoss_fp4_b200_trt_slurm.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh index 0ec2f325f..7542cd1a0 100644 --- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh @@ -86,7 +86,6 @@ source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ From 9ff641e520902233abc608e8f6ed4b00c7336b68 Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Fri, 21 Nov 2025 15:41:20 -0600 Subject: [PATCH 59/60] Audit and correct required environment variables documentation in all benchmark scripts (#252) * Initial plan * Update required env vars documentation in all benchmark scripts Co-authored-by: cquil11 <60715037+cquil11@users.noreply.github.com> * Fix required env vars - remove NF, PREFILL_SIZE, and correct PORT/PORT_OFFSET Co-authored-by: cquil11 <60715037+cquil11@users.noreply.github.com> * Remove internally-calculated vars from required env vars (EXTRA_CONFIG_FILE, MAX_NUM_TOKENS, MOE_BACKEND) Co-authored-by: cquil11 <60715037+cquil11@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: cquil11 <60715037+cquil11@users.noreply.github.com> --- benchmarks/dsr1_fp4_b200_docker.sh | 12 ++++++++++++ benchmarks/dsr1_fp4_b200_trt_slurm.sh | 9 +++------ benchmarks/dsr1_fp4_mi355x_docker.sh | 13 +++++++------ 
benchmarks/dsr1_fp4_mi355x_slurm.sh | 11 ++++------- benchmarks/dsr1_fp8_b200_docker.sh | 11 +++++++---- benchmarks/dsr1_fp8_b200_trt_slurm.sh | 9 +++------ benchmarks/dsr1_fp8_h200_slurm.sh | 10 +++------- benchmarks/dsr1_fp8_h200_trt_slurm.sh | 9 +++------ benchmarks/dsr1_fp8_mi300x_docker.sh | 9 +++++---- benchmarks/dsr1_fp8_mi300x_slurm.sh | 10 +++------- benchmarks/dsr1_fp8_mi325x_docker.sh | 9 +++++---- benchmarks/dsr1_fp8_mi325x_slurm.sh | 9 +++++++++ benchmarks/dsr1_fp8_mi355x_docker.sh | 9 +++++---- benchmarks/dsr1_fp8_mi355x_slurm.sh | 11 ++++------- benchmarks/gptoss_fp4_b200_docker.sh | 12 +++++------- benchmarks/gptoss_fp4_b200_trt_slurm.sh | 9 +++------ benchmarks/gptoss_fp4_h100_docker.sh | 8 +++----- benchmarks/gptoss_fp4_h100_slurm.sh | 11 ++++------- benchmarks/gptoss_fp4_h200_slurm.sh | 10 +++------- benchmarks/gptoss_fp4_h200_trt_slurm.sh | 10 +++------- benchmarks/gptoss_fp4_mi300x_docker.sh | 8 +++++--- benchmarks/gptoss_fp4_mi300x_slurm.sh | 9 +++------ benchmarks/gptoss_fp4_mi325x_docker.sh | 8 +++++--- benchmarks/gptoss_fp4_mi325x_slurm.sh | 9 +++------ benchmarks/gptoss_fp4_mi355x_docker.sh | 7 ++++--- benchmarks/gptoss_fp4_mi355x_slurm.sh | 11 ++++------- 26 files changed, 118 insertions(+), 135 deletions(-) diff --git a/benchmarks/dsr1_fp4_b200_docker.sh b/benchmarks/dsr1_fp4_b200_docker.sh index a520871fa..4ff123a32 100644 --- a/benchmarks/dsr1_fp4_b200_docker.sh +++ b/benchmarks/dsr1_fp4_b200_docker.sh @@ -1,5 +1,17 @@ #!/usr/bin/env bash +# === Required Env Vars === +# MODEL +# PORT +# TP +# CONC +# ISL +# OSL +# RANDOM_RANGE_RATIO +# RESULT_FILENAME +# EP_SIZE +# NUM_PROMPTS + nvidia-smi # To improve CI stability, we patch this helper function to prevent a race condition that diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh index b4227e428..aa2be7648 100644 --- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh @@ -1,16 +1,13 @@ #!/usr/bin/env bash -# === 
Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE +# === Required Env Vars === # MODEL +# TP +# CONC # ISL # OSL # MAX_MODEL_LEN # RANDOM_RANGE_RATIO -# TP -# CONC # RESULT_FILENAME # PORT_OFFSET # DP_ATTENTION diff --git a/benchmarks/dsr1_fp4_mi355x_docker.sh b/benchmarks/dsr1_fp4_mi355x_docker.sh index c20a9f63f..ca1255802 100644 --- a/benchmarks/dsr1_fp4_mi355x_docker.sh +++ b/benchmarks/dsr1_fp4_mi355x_docker.sh @@ -1,14 +1,15 @@ #!/usr/bin/env bash -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE +# === Required Env Vars === # MODEL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO +# PORT # TP # CONC -# PORT +# ISL +# OSL +# RANDOM_RANGE_RATIO +# RESULT_FILENAME +# NUM_PROMPTS export SGLANG_USE_AITER=1 PREFILL_SIZE=196608 diff --git a/benchmarks/dsr1_fp4_mi355x_slurm.sh b/benchmarks/dsr1_fp4_mi355x_slurm.sh index f4d7f1d39..0983b7ddf 100644 --- a/benchmarks/dsr1_fp4_mi355x_slurm.sh +++ b/benchmarks/dsr1_fp4_mi355x_slurm.sh @@ -1,16 +1,13 @@ #!/usr/bin/env bash -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE +# === Required Env Vars === # MODEL +# PORT +# TP +# CONC # ISL # OSL -# MAX_MODEL_LEN # RANDOM_RANGE_RATIO -# TP -# CONC -# PORT # RESULT_FILENAME export SGLANG_USE_AITER=1 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) diff --git a/benchmarks/dsr1_fp8_b200_docker.sh b/benchmarks/dsr1_fp8_b200_docker.sh index ffa7644bd..4d8a9ff18 100644 --- a/benchmarks/dsr1_fp8_b200_docker.sh +++ b/benchmarks/dsr1_fp8_b200_docker.sh @@ -1,13 +1,16 @@ #!/usr/bin/env bash -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE +# === Required Env Vars === # MODEL # PORT # TP # CONC -# MAX_MODEL_LEN +# ISL +# OSL +# RANDOM_RANGE_RATIO +# RESULT_FILENAME +# EP_SIZE +# NUM_PROMPTS nvidia-smi diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh index a9a1a04ff..58d24a7ed 100644 --- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh @@ -1,16 +1,13 @@ 
#!/usr/bin/env bash -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE +# === Required Env Vars === # MODEL +# TP +# CONC # ISL # OSL # MAX_MODEL_LEN # RANDOM_RANGE_RATIO -# TP -# CONC # RESULT_FILENAME # PORT_OFFSET # DP_ATTENTION diff --git a/benchmarks/dsr1_fp8_h200_slurm.sh b/benchmarks/dsr1_fp8_h200_slurm.sh index 06345ecb2..f84d741d6 100644 --- a/benchmarks/dsr1_fp8_h200_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_slurm.sh @@ -1,16 +1,12 @@ #!/usr/bin/env bash -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE +# === Required Env Vars === # MODEL +# TP +# CONC # ISL # OSL -# MAX_MODEL_LEN # RANDOM_RANGE_RATIO -# TP -# CONC # RESULT_FILENAME # PORT_OFFSET diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index 4ece6f7bc..ac6bc167c 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -1,16 +1,13 @@ #!/usr/bin/env bash -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE +# === Required Env Vars === # MODEL +# TP +# CONC # ISL # OSL # MAX_MODEL_LEN # RANDOM_RANGE_RATIO -# TP -# CONC # RESULT_FILENAME # PORT_OFFSET # DP_ATTENTION diff --git a/benchmarks/dsr1_fp8_mi300x_docker.sh b/benchmarks/dsr1_fp8_mi300x_docker.sh index 8c269dd83..e92765ebb 100644 --- a/benchmarks/dsr1_fp8_mi300x_docker.sh +++ b/benchmarks/dsr1_fp8_mi300x_docker.sh @@ -1,13 +1,14 @@ #!/usr/bin/env bash -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE +# === Required Env Vars === # MODEL # PORT # TP # CONC -# MAX_MODEL_LEN +# ISL +# OSL +# RANDOM_RANGE_RATIO +# RESULT_FILENAME # Reference # https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-sglang-deepseek-r1-fp8.html#run-the-inference-benchmark diff --git a/benchmarks/dsr1_fp8_mi300x_slurm.sh b/benchmarks/dsr1_fp8_mi300x_slurm.sh index 5fad7a587..662f4bdfb 100644 --- a/benchmarks/dsr1_fp8_mi300x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi300x_slurm.sh @@ -1,16 +1,12 @@ 
#!/usr/bin/bash -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE +# === Required Env Vars === # MODEL +# TP +# CONC # ISL # OSL -# MAX_MODEL_LEN # RANDOM_RANGE_RATIO -# TP -# CONC # RESULT_FILENAME echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" diff --git a/benchmarks/dsr1_fp8_mi325x_docker.sh b/benchmarks/dsr1_fp8_mi325x_docker.sh index 72c89571d..a8cdf566a 100644 --- a/benchmarks/dsr1_fp8_mi325x_docker.sh +++ b/benchmarks/dsr1_fp8_mi325x_docker.sh @@ -1,13 +1,14 @@ #!/usr/bin/env bash -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE +# === Required Env Vars === # MODEL # PORT # TP # CONC -# MAX_MODEL_LEN +# ISL +# OSL +# RANDOM_RANGE_RATIO +# RESULT_FILENAME # Reference # https://rocm.docs.amd.com/en/docs-7.0-docker/benchmark-docker/inference-sglang-deepseek-r1-fp8.html diff --git a/benchmarks/dsr1_fp8_mi325x_slurm.sh b/benchmarks/dsr1_fp8_mi325x_slurm.sh index 67e4cc394..fb5e07df9 100644 --- a/benchmarks/dsr1_fp8_mi325x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi325x_slurm.sh @@ -1,5 +1,14 @@ #!/usr/bin/bash +# === Required Env Vars === +# MODEL +# TP +# CONC +# ISL +# OSL +# RANDOM_RANGE_RATIO +# RESULT_FILENAME + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) diff --git a/benchmarks/dsr1_fp8_mi355x_docker.sh b/benchmarks/dsr1_fp8_mi355x_docker.sh index 128810b42..8c5038cee 100644 --- a/benchmarks/dsr1_fp8_mi355x_docker.sh +++ b/benchmarks/dsr1_fp8_mi355x_docker.sh @@ -1,13 +1,14 @@ #!/usr/bin/env bash -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE +# === Required Env Vars === # MODEL # PORT # TP # CONC -# MAX_MODEL_LEN +# ISL +# OSL +# RANDOM_RANGE_RATIO +# RESULT_FILENAME # Reference # https://rocm.docs.amd.com/en/docs-7.0-docker/benchmark-docker/inference-sglang-deepseek-r1-fp8.html diff --git a/benchmarks/dsr1_fp8_mi355x_slurm.sh b/benchmarks/dsr1_fp8_mi355x_slurm.sh index fd6fe49fb..921f08a4c 100644 --- a/benchmarks/dsr1_fp8_mi355x_slurm.sh +++ 
b/benchmarks/dsr1_fp8_mi355x_slurm.sh @@ -1,16 +1,13 @@ #!/usr/bin/env bash -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE +# === Required Env Vars === # MODEL +# PORT +# TP +# CONC # ISL # OSL -# MAX_MODEL_LEN # RANDOM_RANGE_RATIO -# TP -# CONC -# PORT # RESULT_FILENAME export HF_MODULES_CACHE="/tmp/hf_modules_cache/" diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh index 1736701c4..4fbf4f50c 100644 --- a/benchmarks/gptoss_fp4_b200_docker.sh +++ b/benchmarks/gptoss_fp4_b200_docker.sh @@ -1,18 +1,16 @@ #!/usr/bin/env bash -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE +# === Required Env Vars === # MODEL +# PORT +# TP +# CONC # ISL # OSL # MAX_MODEL_LEN # RANDOM_RANGE_RATIO -# TP -# CONC # RESULT_FILENAME -# PORT_OFFSET +# NUM_PROMPTS nvidia-smi diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh index 7542cd1a0..44e9dbf4c 100644 --- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh @@ -1,16 +1,13 @@ #!/usr/bin/env bash -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE +# === Required Env Vars === # MODEL +# TP +# CONC # ISL # OSL # MAX_MODEL_LEN # RANDOM_RANGE_RATIO -# TP -# CONC # RESULT_FILENAME # PORT_OFFSET # DP_ATTENTION diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index f38ea3293..48b548e37 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -1,15 +1,13 @@ #!/usr/bin/env bash -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE +# === Required Env Vars === # MODEL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO +# PORT # TP # CONC # ISL # OSL +# RANDOM_RANGE_RATIO # RESULT_FILENAME diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh index 843219b95..a004f8892 100644 --- a/benchmarks/gptoss_fp4_h100_slurm.sh +++ b/benchmarks/gptoss_fp4_h100_slurm.sh @@ -1,17 +1,14 @@ 
#!/usr/bin/env bash -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE +# === Required Env Vars === # MODEL +# PORT +# TP +# CONC # ISL # OSL -# MAX_MODEL_LEN # RANDOM_RANGE_RATIO -# TP -# CONC # RESULT_FILENAME -# PORT_OFFSET echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh index dc29baf8d..970b7ad35 100644 --- a/benchmarks/gptoss_fp4_h200_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_slurm.sh @@ -1,16 +1,12 @@ #!/usr/bin/env bash -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE +# === Required Env Vars === # MODEL +# TP +# CONC # ISL # OSL -# MAX_MODEL_LEN # RANDOM_RANGE_RATIO -# TP -# CONC # RESULT_FILENAME # PORT_OFFSET diff --git a/benchmarks/gptoss_fp4_h200_trt_slurm.sh b/benchmarks/gptoss_fp4_h200_trt_slurm.sh index 21d6ae02c..12a6af5b7 100644 --- a/benchmarks/gptoss_fp4_h200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_trt_slurm.sh @@ -1,16 +1,12 @@ #!/usr/bin/env bash -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE +# === Required Env Vars === # MODEL +# TP +# CONC # ISL # OSL -# MAX_MODEL_LEN # RANDOM_RANGE_RATIO -# TP -# CONC # RESULT_FILENAME # PORT_OFFSET # DP_ATTENTION diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh index 7d1f98226..50d86b52a 100644 --- a/benchmarks/gptoss_fp4_mi300x_docker.sh +++ b/benchmarks/gptoss_fp4_mi300x_docker.sh @@ -1,13 +1,15 @@ #!/usr/bin/env bash -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE +# === Required Env Vars === # MODEL # PORT # TP # CONC +# ISL +# OSL # MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# RESULT_FILENAME # If the machine runs a MEC FW older than 177, RCCL # cannot reclaim some memory. 
diff --git a/benchmarks/gptoss_fp4_mi300x_slurm.sh b/benchmarks/gptoss_fp4_mi300x_slurm.sh index a9e164cc2..a86e66b3d 100644 --- a/benchmarks/gptoss_fp4_mi300x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi300x_slurm.sh @@ -1,16 +1,13 @@ #!/usr/bin/bash -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE +# === Required Env Vars === # MODEL +# TP +# CONC # ISL # OSL # MAX_MODEL_LEN # RANDOM_RANGE_RATIO -# TP -# CONC # RESULT_FILENAME echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" diff --git a/benchmarks/gptoss_fp4_mi325x_docker.sh b/benchmarks/gptoss_fp4_mi325x_docker.sh index 46462ad6d..2117f787e 100644 --- a/benchmarks/gptoss_fp4_mi325x_docker.sh +++ b/benchmarks/gptoss_fp4_mi325x_docker.sh @@ -1,13 +1,15 @@ #!/usr/bin/env bash -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE +# === Required Env Vars === # MODEL # PORT # TP # CONC +# ISL +# OSL # MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# RESULT_FILENAME # If the machine runs a MEC FW older than 177, RCCL # cannot reclaim some memory. 
diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh index f15e6261c..56c7651ed 100644 --- a/benchmarks/gptoss_fp4_mi325x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh @@ -1,16 +1,13 @@ #!/usr/bin/bash -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE +# === Required Env Vars === # MODEL +# TP +# CONC # ISL # OSL # MAX_MODEL_LEN # RANDOM_RANGE_RATIO -# TP -# CONC # RESULT_FILENAME diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh index 0e54245d4..68fc59f8c 100644 --- a/benchmarks/gptoss_fp4_mi355x_docker.sh +++ b/benchmarks/gptoss_fp4_mi355x_docker.sh @@ -1,15 +1,16 @@ #!/usr/bin/env bash -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE +# === Required Env Vars === # MODEL # PORT # TP # CONC +# ISL +# OSL # MAX_MODEL_LEN # RANDOM_RANGE_RATIO # RESULT_FILENAME +# NUM_PROMPTS cat > config.yaml << EOF compilation-config: '{"compile_sizes":[1,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98,100,102,104,106,108,110,112,114,116,118,120,122,124,126,128,256,512,1024,2048,8192] , "cudagraph_capture_sizes":[1,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98,100,102,104,106,108,110,112,114,116,118,120,122,124,126,128,136,144,152,160,168,176,184,192,200,208,216,224,232,240,248,256,264,272,280,288,296,304,312,320,328,336,344,352,360,368,376,384,392,400,408,416,424,432,440,448,456,464,472,480,488,496,504,512,520,528,536,544,552,560,568,576,584,592,600,608,616,624,632,640,648,656,664,672,680,688,696,704,712,720,728,736,744,752,760,768,776,784,792,800,808,816,824,832,840,848,856,864,872,880,888,896,904,912,920,928,936,944,952,960,968,976,984,992,1000,1008,1016,1024,2048,4096,8192] , "cudagraph_mode": "FULL_AND_PIECEWISE"}' diff --git a/benchmarks/gptoss_fp4_mi355x_slurm.sh 
b/benchmarks/gptoss_fp4_mi355x_slurm.sh index a2adf2952..342b7dde3 100644 --- a/benchmarks/gptoss_fp4_mi355x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi355x_slurm.sh @@ -1,16 +1,13 @@ #!/usr/bin/env bash -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE +# === Required Env Vars === # MODEL +# PORT +# TP +# CONC # ISL # OSL -# MAX_MODEL_LEN # RANDOM_RANGE_RATIO -# TP -# CONC -# PORT # RESULT_FILENAME SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) From 6eb8285d0c886027affab84e885effa37306810a Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 21 Nov 2025 15:43:50 -0600 Subject: [PATCH 60/60] removing oci node rebase with main --- runners/launch_mi300x-oci.sh | 18 ------------------ 1 file changed, 18 deletions(-) delete mode 100644 runners/launch_mi300x-oci.sh diff --git a/runners/launch_mi300x-oci.sh b/runners/launch_mi300x-oci.sh deleted file mode 100644 index 33614a03c..000000000 --- a/runners/launch_mi300x-oci.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/usr/bin/bash - -HF_HUB_CACHE_MOUNT="$HOME/hf_hub_cache/" -PORT=8888 - -server_name="bmk-server" - -set -x -docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ ---privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ ---cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ --v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ --e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \ ---entrypoint=/bin/bash \ -$IMAGE \ -benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi300x_docker.sh"