From cd9cb64c41d3d4c861210b5442cac43b6f07e4d5 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 14:19:08 -0600 Subject: [PATCH 001/214] initial poc --- benchmarks/gptoss_fp4_h100_docker.sh | 29 +++++++++++- benchmarks/gptoss_fp4_h100_slurm.sh | 1 - runners/launch_h100-cr.sh | 68 ++++++++++++++-------------- 3 files changed, 62 insertions(+), 36 deletions(-) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index a8bb57c16..39a5abf63 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -7,6 +7,9 @@ # MAX_MODEL_LEN # TP # CONC +# ISL +# OSL + cat > config.yaml << EOF compilation-config: '{"cudagraph_mode":"PIECEWISE"}' @@ -18,6 +21,7 @@ max-model-len: 10240 EOF export PYTHONNOUSERSITE=1 +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) set -x vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ @@ -25,4 +29,27 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --gpu-memory-utilization=0.9 \ --tensor-parallel-size=$TP \ --max-num-seqs=$CONC \ ---disable-log-requests +--disable-log-requests > $SERVER_LOG 2>&1 & + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" =~ Application\ startup\ complete ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +pip install -q datasets pandas +git clone https://github.com/kimbochen/bench_serving.git +set -x +python3 bench_serving/benchmark_serving.py \ +--model=$MODEL \ +--backend=vllm \ +--base-url=\"http://localhost:$PORT\" \ +--dataset-name=random \ +--random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ +--num-prompts=$(( $CONC * 10 )) --max-concurrency=$CONC \ +--request-rate=inf --ignore-eos \ +--save-result --percentile-metrics='ttft,tpot,itl,e2el' \ +--result-dir=/workspace/ \ +--result-filename=$RESULT_FILENAME.json" \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh index d2819b5b3..e9092703a 100644 --- a/benchmarks/gptoss_fp4_h100_slurm.sh +++ b/benchmarks/gptoss_fp4_h100_slurm.sh @@ -3,7 +3,6 @@ # === Required Env Vars === # HF_TOKEN # HF_HUB_CACHE -# IMAGE # MODEL # ISL # OSL diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh index 47b350128..1eb58c32e 100644 --- a/runners/launch_h100-cr.sh +++ b/runners/launch_h100-cr.sh @@ -4,7 +4,7 @@ HF_HUB_CACHE_MOUNT="/home/ubuntu/hf_hub_cache/" PORT=8888 server_name="bmk-server" -client_name="bmk-client" +# client_name="bmk-client" set -x docker run --rm -d --network=host --name=$server_name \ @@ -17,38 +17,38 @@ docker run --rm -d --network=host --name=$server_name \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_h100_docker.sh" -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ Application\ startup\ complete ]]; then - break - fi -done < <(docker logs -f --tail=0 $server_name 2>&1) - -if ! docker ps --format "{{.Names}}" | grep -q "$server_name"; then - echo "Server container launch failed." 
- exit 1 -fi - -git clone https://github.com/kimbochen/bench_serving.git - -set -x -docker run --rm --network=host --name=$client_name \ --v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ ---entrypoint=/bin/bash \ -$IMAGE \ --lc "pip install -q datasets pandas && \ -python3 bench_serving/benchmark_serving.py \ ---model=$MODEL \ ---backend=vllm \ ---base-url=\"http://localhost:$PORT\" \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) --max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics='ttft,tpot,itl,e2el' \ ---result-dir=/workspace/ \ ---result-filename=$RESULT_FILENAME.json" +# set +x +# while IFS= read -r line; do +# printf '%s\n' "$line" +# if [[ "$line" =~ Application\ startup\ complete ]]; then +# break +# fi +# done < <(docker logs -f --tail=0 $server_name 2>&1) + +# if ! docker ps --format "{{.Names}}" | grep -q "$server_name"; then +# echo "Server container launch failed." +# exit 1 +# fi + +# git clone https://github.com/kimbochen/bench_serving.git + +# set -x +# docker run --rm --network=host --name=$client_name \ +# -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ +# -e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ +# --entrypoint=/bin/bash \ +# $IMAGE \ +# -lc "pip install -q datasets pandas && \ +# python3 bench_serving/benchmark_serving.py \ +# --model=$MODEL \ +# --backend=vllm \ +# --base-url=\"http://localhost:$PORT\" \ +# --dataset-name=random \ +# --random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ +# --num-prompts=$(( $CONC * 10 )) --max-concurrency=$CONC \ +# --request-rate=inf --ignore-eos \ +# --save-result --percentile-metrics='ttft,tpot,itl,e2el' \ +# --result-dir=/workspace/ \ +# --result-filename=$RESULT_FILENAME.json" docker stop $server_name From 00ac64a55cec31c1c6e8761daba271d88a5e6c80 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 14:45:02 -0600 Subject: [PATCH 002/214] remove -d flag when launching docker container --- runners/launch_h100-cr.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh index 1eb58c32e..51def9743 100644 --- a/runners/launch_h100-cr.sh +++ b/runners/launch_h100-cr.sh @@ -7,7 +7,7 @@ server_name="bmk-server" # client_name="bmk-client" set -x -docker run --rm -d --network=host --name=$server_name \ +docker run --rm --network=host --name=$server_name \ --runtime=nvidia --gpus=all --ipc=host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ From e38b38aa96d5a9eb09c1ad09074c15450159ea41 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 14:50:32 -0600 Subject: [PATCH 003/214] syntax error --- benchmarks/gptoss_fp4_h100_docker.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 39a5abf63..3700ea357 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -52,4 +52,4 @@ python3 bench_serving/benchmark_serving.py \ --request-rate=inf --ignore-eos \ --save-result --percentile-metrics='ttft,tpot,itl,e2el' \ --result-dir=/workspace/ \ ---result-filename=$RESULT_FILENAME.json" \ No newline at end of file +--result-filename=$RESULT_FILENAME.json \ No newline at 
end of file From 66eae81b19aacab22a59cf2121f1878c7ff91338 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 14:58:35 -0600 Subject: [PATCH 004/214] compatibility fixes --- benchmarks/gptoss_fp4_h100_docker.sh | 11 ++++------- runners/launch_h100-cr.sh | 2 +- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 3700ea357..aee233793 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -5,6 +5,7 @@ # HF_HUB_CACHE # MODEL # MAX_MODEL_LEN +# RANDOM_RANGE_RATIO # TP # CONC # ISL @@ -21,7 +22,6 @@ max-model-len: 10240 EOF export PYTHONNOUSERSITE=1 -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) set -x vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ @@ -32,12 +32,9 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --disable-log-requests > $SERVER_LOG 2>&1 & set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ Application\ startup\ complete ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +until curl --output /dev/null --silent --head --fail http://localhost:$PORT; do + sleep 5 +done pip install -q datasets pandas git clone https://github.com/kimbochen/bench_serving.git diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh index 51def9743..8553d9b59 100644 --- a/runners/launch_h100-cr.sh +++ b/runners/launch_h100-cr.sh @@ -11,7 +11,7 @@ docker run --rm --network=host --name=$server_name \ --runtime=nvidia --gpus=all --ipc=host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT \ +-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \ -e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --entrypoint=/bin/bash \ $IMAGE \ From fdec241c711313934bc112453739ade661e6c01f Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 14:59:51 -0600 Subject: [PATCH 005/214] add correct endpoint prefix --- benchmarks/gptoss_fp4_h100_docker.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index aee233793..80af1b8e0 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -32,7 +32,7 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --disable-log-requests > $SERVER_LOG 2>&1 & set +x -until curl --output /dev/null --silent --head --fail http://localhost:$PORT; do +until curl --output /dev/null --silent --head --fail http://localhost:$PORT/health; do sleep 5 done From 08de857790de7726a2b1864fdab2de6c1a573768 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 15:06:41 -0600 Subject: [PATCH 006/214] remove reference env var --- benchmarks/gptoss_fp4_h100_docker.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 80af1b8e0..d914e6a06 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -29,7 +29,7 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --gpu-memory-utilization=0.9 \ --tensor-parallel-size=$TP \ --max-num-seqs=$CONC \ ---disable-log-requests > $SERVER_LOG 2>&1 & 
+--disable-log-requests set +x until curl --output /dev/null --silent --head --fail http://localhost:$PORT/health; do From 06231ee3a78ee37eca6a7ed4c58a770e8550041c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 15:13:14 -0600 Subject: [PATCH 007/214] run vllm serve in background --- benchmarks/gptoss_fp4_h100_docker.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index d914e6a06..e4efb03ec 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -29,10 +29,10 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --gpu-memory-utilization=0.9 \ --tensor-parallel-size=$TP \ --max-num-seqs=$CONC \ ---disable-log-requests +--disable-log-requests & set +x -until curl --output /dev/null --silent --head --fail http://localhost:$PORT/health; do +until curl --output /dev/null --silent --fail http://localhost:$PORT/health; do sleep 5 done From 21ed06746baa45be52f65b820110bf7684d2cf01 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 15:18:56 -0600 Subject: [PATCH 008/214] unescape sequences --- benchmarks/gptoss_fp4_h100_docker.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index e4efb03ec..5ba23d1ff 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -42,7 +42,7 @@ set -x python3 bench_serving/benchmark_serving.py \ --model=$MODEL \ --backend=vllm \ ---base-url=\"http://localhost:$PORT\" \ +--base-url=http://localhost:$PORT \ --dataset-name=random \ --random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ --num-prompts=$(( $CONC * 10 )) --max-concurrency=$CONC \ From 65ef1f0b6232c5a513bb6bb1eb37c08d23038e58 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 15:38:27 -0600 Subject: [PATCH 009/214] stop vllm to stdout after it stops --- benchmarks/gptoss_fp4_h100_docker.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 5ba23d1ff..ae5fb2a9f 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -31,11 +31,17 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --max-num-seqs=$CONC \ --disable-log-requests & +SERVER_PID=$! set +x +tail -f /tmp/vllm_server.log & +TAIL_PID=$! + until curl --output /dev/null --silent --fail http://localhost:$PORT/health; do sleep 5 done +kill $TAIL_PID 2>/dev/null + pip install -q datasets pandas git clone https://github.com/kimbochen/bench_serving.git set -x From cb557214884bcc93af777098f4b89ad1c40f3745 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 15:41:45 -0600 Subject: [PATCH 010/214] stop vllm to stdout after it stops pt 2 --- benchmarks/gptoss_fp4_h100_docker.sh | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index ae5fb2a9f..4ef463bf1 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -29,18 +29,16 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --gpu-memory-utilization=0.9 \ --tensor-parallel-size=$TP \ --max-num-seqs=$CONC \ ---disable-log-requests & +--disable-log-requests 2>&1 | tee $(mktemp /tmp/server-XXXXXX.log) & -SERVER_PID=$! +VLLM_PID=$! set +x -tail -f /tmp/vllm_server.log & -TAIL_PID=$! 
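Taken together, patches 006 through 010 converge on a simple readiness idiom: launch the server in the background, then poll its health endpoint until it answers. A minimal standalone sketch of that idiom with an added deadline (the wait_for_health name and the 900-second timeout are illustrative; the scripts themselves poll indefinitely):

wait_for_health() {
    local url=$1
    local deadline=$(( SECONDS + 900 ))
    # --fail makes curl exit non-zero on HTTP errors, so 404/503 keep us polling
    until curl --output /dev/null --silent --fail "$url"; do
        if (( SECONDS > deadline )); then
            echo "server not healthy after 900s" >&2
            return 1
        fi
        sleep 5
    done
}

vllm serve "$MODEL" --host=0.0.0.0 --port="$PORT" &
wait_for_health "http://localhost:$PORT/health" || exit 1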
until curl --output /dev/null --silent --fail http://localhost:$PORT/health; do sleep 5 done -kill $TAIL_PID 2>/dev/null +pkill -P $$ tee 2>/dev/null pip install -q datasets pandas git clone https://github.com/kimbochen/bench_serving.git From 788b7f1510031ecc98493b2c3e16ca14f16bb3bc Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 15:47:37 -0600 Subject: [PATCH 011/214] get rid of docker stop as no longer in detatched --- runners/launch_h100-cr.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh index 8553d9b59..a34b31c88 100644 --- a/runners/launch_h100-cr.sh +++ b/runners/launch_h100-cr.sh @@ -51,4 +51,4 @@ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_h100_docker.sh" # --result-dir=/workspace/ \ # --result-filename=$RESULT_FILENAME.json" -docker stop $server_name +# docker stop $server_name From a87e17496191406f8ed03245e26e077f5ff2661e Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 16:01:09 -0600 Subject: [PATCH 012/214] clone bench serving to tmp dir --- benchmarks/gptoss_fp4_h100_docker.sh | 5 +++-- runners/launch_h100-cr.sh | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 4ef463bf1..5420c220d 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -41,9 +41,10 @@ done pkill -P $$ tee 2>/dev/null pip install -q datasets pandas -git clone https://github.com/kimbochen/bench_serving.git +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR set -x -python3 bench_serving/benchmark_serving.py \ +python3 $BENCH_SERVING_DIR/bench_serving/benchmark_serving.py \ --model=$MODEL \ --backend=vllm \ --base-url=http://localhost:$PORT \ diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh index a34b31c88..18c791614 100644 --- a/runners/launch_h100-cr.sh +++ b/runners/launch_h100-cr.sh @@ -12,7 +12,7 @@ docker run --rm --network=host --name=$server_name \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \ --e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ +-e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_h100_docker.sh" From c1d0a796e0fb75dd4369b0589997f5a9d56853d1 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 16:07:27 -0600 Subject: [PATCH 013/214] clone bench serving to tmp dir pt 2 --- benchmarks/gptoss_fp4_h100_docker.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 5420c220d..2c8bfb3c5 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -44,7 +44,7 @@ pip install -q datasets pandas BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR set -x -python3 $BENCH_SERVING_DIR/bench_serving/benchmark_serving.py \ +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model=$MODEL \ --backend=vllm \ --base-url=http://localhost:$PORT \ From 
4823afa516348ae4f1aac230d8bae751c1e2a91a Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 16:14:36 -0600 Subject: [PATCH 014/214] add explanatory comment --- benchmarks/gptoss_fp4_h100_docker.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 2c8bfb3c5..6229bed85 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -31,13 +31,12 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --max-num-seqs=$CONC \ --disable-log-requests 2>&1 | tee $(mktemp /tmp/server-XXXXXX.log) & +# Show server logs til' it is up, then stop showing VLLM_PID=$! set +x - until curl --output /dev/null --silent --fail http://localhost:$PORT/health; do sleep 5 done - pkill -P $$ tee 2>/dev/null pip install -q datasets pandas From d52299fe46701c857925e507e39e62c134749701 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 16:35:43 -0600 Subject: [PATCH 015/214] cleaning up --- runners/launch_h100-cr.sh | 37 ------------------------------------- 1 file changed, 37 deletions(-) diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh index 18c791614..d1ddc26de 100644 --- a/runners/launch_h100-cr.sh +++ b/runners/launch_h100-cr.sh @@ -4,7 +4,6 @@ HF_HUB_CACHE_MOUNT="/home/ubuntu/hf_hub_cache/" PORT=8888 server_name="bmk-server" -# client_name="bmk-client" set -x docker run --rm --network=host --name=$server_name \ @@ -16,39 +15,3 @@ docker run --rm --network=host --name=$server_name \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_h100_docker.sh" - -# set +x -# while IFS= read -r line; do -# printf '%s\n' "$line" -# if [[ "$line" =~ Application\ startup\ complete ]]; then -# break -# fi -# done < <(docker logs -f --tail=0 $server_name 2>&1) - -# if ! docker ps --format "{{.Names}}" | grep -q "$server_name"; then -# echo "Server container launch failed." 
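The comment added in patch 014 is worth unpacking. In a pipeline of the form cmd 2>&1 | tee $(mktemp ...) &, the whole pipeline is backgrounded, so the server's output is both mirrored to the console and kept on disk; pkill -P $$ tee then kills only tee processes whose parent is this script's shell ($$), silencing the console without signalling the server directly. A stripped-down sketch, where some_server is a hypothetical stand-in for the real serve command:

# hypothetical stand-in for the real server command
some_server 2>&1 | tee "$(mktemp /tmp/server-XXXXXX.log)" &

# ... poll the health endpoint here ...

# tee was forked by this shell, so it is a direct child of $$;
# the server on the other side of the pipe receives no signal
pkill -P $$ tee 2>/dev/null

One caveat: once tee exits, the server's next write to the now reader-less pipe raises SIGPIPE, which is likely why later patches in this series switch to redirecting into a log file and running tail -f on it instead.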
-# exit 1 -# fi - -# git clone https://github.com/kimbochen/bench_serving.git - -# set -x -# docker run --rm --network=host --name=$client_name \ -# -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -# -e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ -# --entrypoint=/bin/bash \ -# $IMAGE \ -# -lc "pip install -q datasets pandas && \ -# python3 bench_serving/benchmark_serving.py \ -# --model=$MODEL \ -# --backend=vllm \ -# --base-url=\"http://localhost:$PORT\" \ -# --dataset-name=random \ -# --random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ -# --num-prompts=$(( $CONC * 10 )) --max-concurrency=$CONC \ -# --request-rate=inf --ignore-eos \ -# --save-result --percentile-metrics='ttft,tpot,itl,e2el' \ -# --result-dir=/workspace/ \ -# --result-filename=$RESULT_FILENAME.json" - -# docker stop $server_name From 85de6e752391380ec328ff119798cfab06695970 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 09:13:56 -0600 Subject: [PATCH 016/214] cleaning up --- benchmarks/gptoss_fp4_mi355x_docker.sh | 38 +++++++++++++++++++++++++- runners/launch_mi355x-amd.sh | 8 +++--- 2 files changed, 41 insertions(+), 5 deletions(-) diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh index 103e77fe3..de5ce9ce7 100644 --- a/benchmarks/gptoss_fp4_mi355x_docker.sh +++ b/benchmarks/gptoss_fp4_mi355x_docker.sh @@ -30,4 +30,40 @@ vllm serve $MODEL --port $PORT \ --block-size=64 \ --no-enable-prefix-caching \ --disable-log-requests \ ---async-scheduling +--async-scheduling | tee $(mktemp /tmp/server-XXXXXX.log) & + +# Show server logs til' it is up, then stop showing +VLLM_PID=$! +set +x +until curl --output /dev/null --silent --fail http://localhost:$PORT/health; do + sleep 5 +done +pkill -P $$ tee 2>/dev/null + +if [[ "$MODEL" == "amd/DeepSeek-R1-0528-MXFP4-Preview" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then + if [[ "$OSL" == "8192" ]]; then + NUM_PROMPTS=$(( CONC * 20 )) + else + NUM_PROMPTS=$(( CONC * 50 )) + fi +else + NUM_PROMPTS=$(( CONC * 10 )) +fi + +git clone https://github.com/kimbochen/bench_serving.git + +set -x +docker run --rm --network=$network_name --name=$client_name \ +-v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ +-e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ +--entrypoint=python3 \ +$IMAGE \ +bench_serving/benchmark_serving.py \ +--model=$MODEL --backend=vllm --base-url="http://localhost:$PORT" \ +--dataset-name=random \ +--random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ +--num-prompts=$NUM_PROMPTS \ +--max-concurrency=$CONC \ +--request-rate=inf --ignore-eos \ +--save-result --percentile-metrics="ttft,tpot,itl,e2el" \ +--result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json diff --git a/runners/launch_mi355x-amd.sh b/runners/launch_mi355x-amd.sh index 87ee8cbd2..b3cecf6e4 100644 --- a/runners/launch_mi355x-amd.sh +++ b/runners/launch_mi355x-amd.sh @@ -17,14 +17,14 @@ HF_HUB_CACHE_MOUNT="/nfsdata/hf_hub_cache-1/" # Temp solution PORT=8888 -network_name="bmk-net" +# network_name="bmk-net" server_name="bmk-server" -client_name="bmk-client" +# client_name="bmk-client" -docker network create $network_name +# docker network create $network_name set -x -docker run --rm -d --ipc=host --shm-size=16g --network=$network_name --name=$server_name \ +docker run --rm -d --ipc=host --shm-size=16g --network=host --name=$server_name \ --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ --cap-add=SYS_PTRACE 
--security-opt seccomp=unconfined \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ From 48f7588da8b40be15bf190e9b4926f08390a17e6 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 10:57:36 -0600 Subject: [PATCH 017/214] adding mi355x refactor --- benchmarks/dsr1_fp8_mi355x_docker.sh | 33 ++++++++++- benchmarks/gptoss_fp4_h100_docker.sh | 1 - benchmarks/gptoss_fp4_mi355x_docker.sh | 24 ++------ runners/launch_mi355x-amd.sh | 78 +++++++++++++------------- 4 files changed, 76 insertions(+), 60 deletions(-) diff --git a/benchmarks/dsr1_fp8_mi355x_docker.sh b/benchmarks/dsr1_fp8_mi355x_docker.sh index f39a8dbbd..baad70fd8 100644 --- a/benchmarks/dsr1_fp8_mi355x_docker.sh +++ b/benchmarks/dsr1_fp8_mi355x_docker.sh @@ -24,5 +24,36 @@ python3 -m sglang.launch_server \ --mem-fraction-static 0.8 --disable-radix-cache \ --num-continuous-decode-steps 4 \ --max-prefill-tokens 196608 \ - --cuda-graph-max-bs 128 + --cuda-graph-max-bs 128 | tee $(mktemp /tmp/server-XXXXXX.log) & + +set +x +until curl --output /dev/null --silent --fail http://localhost:$PORT/health; do + sleep 5 +done +pkill -P $$ tee 2>/dev/null + +if [[ "$MODEL" == "amd/DeepSeek-R1-0528-MXFP4-Preview" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then + if [[ "$OSL" == "8192" ]]; then + NUM_PROMPTS=$(( CONC * 20 )) + else + NUM_PROMPTS=$(( CONC * 50 )) + fi +else + NUM_PROMPTS=$(( CONC * 10 )) +fi + +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +set -x +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ +--model=$MODEL --backend=vllm --base-url="http://localhost:$PORT" \ +--dataset-name=random \ +--random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ +--num-prompts=$NUM_PROMPTS \ +--max-concurrency=$CONC \ +--request-rate=inf --ignore-eos \ +--save-result --percentile-metrics="ttft,tpot,itl,e2el" \ +--result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json + + diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 6229bed85..1b4453be3 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -32,7 +32,6 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --disable-log-requests 2>&1 | tee $(mktemp /tmp/server-XXXXXX.log) & # Show server logs til' it is up, then stop showing -VLLM_PID=$! 
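A note on the growing -e lists in these launchers: with docker run, -e NAME with no value forwards NAME from the launching shell's environment into the container (and sets nothing if NAME is unset there), while -e NAME=value sets it explicitly. That is why the runner scripts only need the benchmark knobs exported in their own environment. A small self-contained illustration with made-up values:

export MODEL=some/model TP=8   # illustrative values, not from the scripts
docker run --rm \
  -e MODEL -e TP \
  -e PORT=8888 \
  ubuntu:24.04 /bin/bash -c 'echo "model=$MODEL tp=$TP port=$PORT"'
# prints: model=some/model tp=8 port=8888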
set +x until curl --output /dev/null --silent --fail http://localhost:$PORT/health; do sleep 5 diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh index de5ce9ce7..533f5e212 100644 --- a/benchmarks/gptoss_fp4_mi355x_docker.sh +++ b/benchmarks/gptoss_fp4_mi355x_docker.sh @@ -8,6 +8,8 @@ # TP # CONC # MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# RESULT_FILENAME cat > config.yaml << EOF compilation-config: '{"compile_sizes":[1,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98,100,102,104,106,108,110,112,114,116,118,120,122,124,126,128,256,512,1024,2048,8192] , "cudagraph_capture_sizes":[1,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98,100,102,104,106,108,110,112,114,116,118,120,122,124,126,128,136,144,152,160,168,176,184,192,200,208,216,224,232,240,248,256,264,272,280,288,296,304,312,320,328,336,344,352,360,368,376,384,392,400,408,416,424,432,440,448,456,464,472,480,488,496,504,512,520,528,536,544,552,560,568,576,584,592,600,608,616,624,632,640,648,656,664,672,680,688,696,704,712,720,728,736,744,752,760,768,776,784,792,800,808,816,824,832,840,848,856,864,872,880,888,896,904,912,920,928,936,944,952,960,968,976,984,992,1000,1008,1016,1024,2048,4096,8192] , "cudagraph_mode": "FULL_AND_PIECEWISE"}' @@ -33,32 +35,16 @@ vllm serve $MODEL --port $PORT \ --async-scheduling | tee $(mktemp /tmp/server-XXXXXX.log) & # Show server logs til' it is up, then stop showing -VLLM_PID=$! set +x until curl --output /dev/null --silent --fail http://localhost:$PORT/health; do sleep 5 done pkill -P $$ tee 2>/dev/null -if [[ "$MODEL" == "amd/DeepSeek-R1-0528-MXFP4-Preview" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then - if [[ "$OSL" == "8192" ]]; then - NUM_PROMPTS=$(( CONC * 20 )) - else - NUM_PROMPTS=$(( CONC * 50 )) - fi -else - NUM_PROMPTS=$(( CONC * 10 )) -fi - -git clone https://github.com/kimbochen/bench_serving.git - +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR set -x -docker run --rm --network=$network_name --name=$client_name \ --v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ ---entrypoint=python3 \ -$IMAGE \ -bench_serving/benchmark_serving.py \ +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model=$MODEL --backend=vllm --base-url="http://localhost:$PORT" \ --dataset-name=random \ --random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ diff --git a/runners/launch_mi355x-amd.sh b/runners/launch_mi355x-amd.sh index b3cecf6e4..009a53108 100644 --- a/runners/launch_mi355x-amd.sh +++ b/runners/launch_mi355x-amd.sh @@ -24,52 +24,52 @@ server_name="bmk-server" # docker network create $network_name set -x -docker run --rm -d --ipc=host --shm-size=16g --network=host --name=$server_name \ +docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ --e ISL -e OSL \ +-e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \ 
--entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi355x_docker.sh" -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ Application\ startup\ complete ]]; then - break - fi -done < <(docker logs -f --tail=0 $server_name 2>&1) +# set +x +# while IFS= read -r line; do +# printf '%s\n' "$line" +# if [[ "$line" =~ Application\ startup\ complete ]]; then +# break +# fi +# done < <(docker logs -f --tail=0 $server_name 2>&1) -if [[ "$MODEL" == "amd/DeepSeek-R1-0528-MXFP4-Preview" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then - if [[ "$OSL" == "8192" ]]; then - NUM_PROMPTS=$(( CONC * 20 )) - else - NUM_PROMPTS=$(( CONC * 50 )) - fi -else - NUM_PROMPTS=$(( CONC * 10 )) -fi +# if [[ "$MODEL" == "amd/DeepSeek-R1-0528-MXFP4-Preview" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then +# if [[ "$OSL" == "8192" ]]; then +# NUM_PROMPTS=$(( CONC * 20 )) +# else +# NUM_PROMPTS=$(( CONC * 50 )) +# fi +# else +# NUM_PROMPTS=$(( CONC * 10 )) +# fi -git clone https://github.com/kimbochen/bench_serving.git +# git clone https://github.com/kimbochen/bench_serving.git -set -x -docker run --rm --network=$network_name --name=$client_name \ --v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ ---entrypoint=python3 \ -$IMAGE \ -bench_serving/benchmark_serving.py \ ---model=$MODEL --backend=vllm --base-url="http://$server_name:$PORT" \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$NUM_PROMPTS \ ---max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics="ttft,tpot,itl,e2el" \ ---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json +# set -x +# docker run --rm --network=$network_name --name=$client_name \ +# -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ +# -e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ +# --entrypoint=python3 \ +# $IMAGE \ +# bench_serving/benchmark_serving.py \ +# --model=$MODEL --backend=vllm --base-url="http://$server_name:$PORT" \ +# --dataset-name=random \ +# --random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ +# --num-prompts=$NUM_PROMPTS \ +# --max-concurrency=$CONC \ +# --request-rate=inf --ignore-eos \ +# --save-result --percentile-metrics="ttft,tpot,itl,e2el" \ +# --result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json if ls gpucore.* 1> /dev/null 2>&1; then echo "gpucore files exist. 
not good" @@ -77,8 +77,8 @@ if ls gpucore.* 1> /dev/null 2>&1; then fi -while [ -n "$(docker ps -aq)" ]; do - docker stop $server_name - docker network rm $network_name - sleep 5 -done +# while [ -n "$(docker ps -aq)" ]; do +# docker stop $server_name +# # docker network rm $network_name +# sleep 5 +# done From faec31e7d9a7cab97d396e9f2533e8e8b8690728 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 11:37:44 -0600 Subject: [PATCH 018/214] adding h200 initial refactor --- benchmarks/gptoss_fp4_h200_trt_slurm.sh | 34 +++++++++++++++++-------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/benchmarks/gptoss_fp4_h200_trt_slurm.sh b/benchmarks/gptoss_fp4_h200_trt_slurm.sh index c148a3cb7..7e411b05a 100644 --- a/benchmarks/gptoss_fp4_h200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_trt_slurm.sh @@ -19,7 +19,7 @@ echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" hf download $MODEL -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +# SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) @@ -45,20 +45,32 @@ stream_interval: 20 EOF #mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens $MAX_MODEL_LEN --num_postprocess_workers 2 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & -mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --max_batch_size $CONC --max_num_tokens 20000 --backend pytorch --extra_llm_api_options gptoss-config.yml --ep_size=$EP_SIZE --trust_remote_code --gpus_per_node 8 --host 0.0.0.0 --port $PORT --tp_size=$TP --pp_size=1 > $SERVER_LOG 2>&1 & - +mpirun -n 1 --oversubscribe --allow-run-as-root \ +trtllm-serve $MODEL \ +--max_batch_size $CONC \ +--max_num_tokens 20000 \ +--backend pytorch \ +--extra_llm_api_options gptoss-config.yml \ +--ep_size=$EP_SIZE \ +--trust_remote_code \ +--gpus_per_node 8 \ +--host 0.0.0.0 \ +--port $PORT \ +--tp_size=$TP \ +--pp_size=1 \ +2>&1 | tee $(mktemp /tmp/server-XXXXXX.log) & +# Show server logs til' it is up, then stop showing set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +pkill -P $$ tee 2>/dev/null set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend openai \ --base-url http://0.0.0.0:$PORT \ --dataset-name random \ From 1ef1b23a56287f129936a1c82f9d594db9410f7a Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 15:46:11 -0600 Subject: [PATCH 019/214] different way to see server logs --- benchmarks/gptoss_fp4_h200_trt_slurm.sh | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/benchmarks/gptoss_fp4_h200_trt_slurm.sh b/benchmarks/gptoss_fp4_h200_trt_slurm.sh index 7e411b05a..ac084ef13 100644 --- a/benchmarks/gptoss_fp4_h200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_trt_slurm.sh @@ -44,6 +44,8 @@ print_iter_log: true stream_interval: 20 EOF +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) + #mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens $MAX_MODEL_LEN 
--num_postprocess_workers 2 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & mpirun -n 1 --oversubscribe --allow-run-as-root \ trtllm-serve $MODEL \ @@ -58,14 +60,18 @@ trtllm-serve $MODEL \ --port $PORT \ --tp_size=$TP \ --pp_size=1 \ -2>&1 | tee $(mktemp /tmp/server-XXXXXX.log) & +> $SERVER_LOG 2>&1 & + +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! # Show server logs til' it is up, then stop showing set +x until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do sleep 5 done -pkill -P $$ tee 2>/dev/null +kill $TAIL_PID set -x BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) From 75523ee0b8f56e252b7f209a01a52743047e2913 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 15:59:58 -0600 Subject: [PATCH 020/214] cleanup --- benchmarks/dsr1_fp8_h200_trt_slurm.sh | 18 ++++++++++-------- benchmarks/gptoss_fp4_h100_docker.sh | 13 ++++++++----- benchmarks/gptoss_fp4_h100_slurm.sh | 18 ++++++++++-------- benchmarks/gptoss_fp4_h200_trt_slurm.sh | 7 +------ benchmarks/gptoss_fp4_mi355x_slurm.sh | 18 ++++++++++-------- 5 files changed, 39 insertions(+), 35 deletions(-) diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index 7b566c0ab..58d7e9724 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -71,17 +71,19 @@ PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \ > $SERVER_LOG 2>&1 & +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID -git clone https://github.com/kimbochen/bench_serving.git set -x -python3 bench_serving/benchmark_serving.py \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend openai \ --base-url http://0.0.0.0:$PORT \ --dataset-name random \ diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 1b4453be3..ae889474c 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -22,6 +22,7 @@ max-model-len: 10240 EOF export PYTHONNOUSERSITE=1 +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) set -x vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ @@ -29,19 +30,21 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --gpu-memory-utilization=0.9 \ --tensor-parallel-size=$TP \ --max-num-seqs=$CONC \ ---disable-log-requests 2>&1 | tee $(mktemp /tmp/server-XXXXXX.log) & +--disable-log-requests > $SERVER_LOG 2>&1 & -# Show server logs til' it is up, then stop showing +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
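All of these serving scripts generate their engine config the same way: a heredoc with an unquoted delimiter, so the shell expands any $VAR references at script run time, before the YAML is written. A minimal sketch of the mechanism (the knob name and value are illustrative):

MAX_NUM_TOKENS=20000
cat > engine-config.yml << EOF
max_num_tokens: $MAX_NUM_TOKENS
EOF
# with a quoted delimiter (<< 'EOF') the literal string $MAX_NUM_TOKENS
# would be written instead, so the unquoted form here is deliberate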
set +x -until curl --output /dev/null --silent --fail http://localhost:$PORT/health; do +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do sleep 5 done -pkill -P $$ tee 2>/dev/null +kill $TAIL_PID pip install -q datasets pandas +set -x BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -set -x python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model=$MODEL \ --backend=vllm \ diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh index e9092703a..d82bebf72 100644 --- a/benchmarks/gptoss_fp4_h100_slurm.sh +++ b/benchmarks/gptoss_fp4_h100_slurm.sh @@ -35,18 +35,20 @@ PYTHONNOUSERSITE=1 vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --max-num-seqs=$CONC \ --disable-log-requests > $SERVER_LOG 2>&1 & +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID pip install -q datasets pandas -git clone https://github.com/kimbochen/bench_serving.git set -x -python3 bench_serving/benchmark_serving.py \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model=$MODEL \ --backend=vllm \ --base-url="http://0.0.0.0:$PORT" \ diff --git a/benchmarks/gptoss_fp4_h200_trt_slurm.sh b/benchmarks/gptoss_fp4_h200_trt_slurm.sh index ac084ef13..0927a0d61 100644 --- a/benchmarks/gptoss_fp4_h200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_trt_slurm.sh @@ -19,7 +19,7 @@ echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" hf download $MODEL -# SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) @@ -44,9 +44,6 @@ print_iter_log: true stream_interval: 20 EOF -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) - -#mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens $MAX_MODEL_LEN --num_postprocess_workers 2 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & mpirun -n 1 --oversubscribe --allow-run-as-root \ trtllm-serve $MODEL \ --max_batch_size $CONC \ @@ -65,8 +62,6 @@ trtllm-serve $MODEL \ # Show logs until server is ready tail -f $SERVER_LOG & TAIL_PID=$! - -# Show server logs til' it is up, then stop showing set +x until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do sleep 5 diff --git a/benchmarks/gptoss_fp4_mi355x_slurm.sh b/benchmarks/gptoss_fp4_mi355x_slurm.sh index 657bc1fdf..1fcba771f 100644 --- a/benchmarks/gptoss_fp4_mi355x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi355x_slurm.sh @@ -38,17 +38,19 @@ vllm serve $MODEL --port $PORT \ --disable-log-requests \ --async-scheduling > $SERVER_LOG 2>&1 & +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
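The mktemp -d pattern standardized here (first introduced in patches 012 and 013) is what makes reruns on a shared workspace idempotent: git clone refuses a non-empty destination, but it will clone into an existing empty directory, and mktemp -d always hands back a fresh empty one. A sketch under those assumptions:

# a unique empty directory per run, so repeated runs never collide
BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX)
git clone https://github.com/kimbochen/bench_serving.git "$BENCH_SERVING_DIR"
python3 "$BENCH_SERVING_DIR/benchmark_serving.py" --help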
set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend vllm \ --base-url "http://0.0.0.0:$PORT" \ --dataset-name random \ From 25366523662ce8a6ae7efeb36f289636f189d125 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 16:39:39 -0600 Subject: [PATCH 021/214] now fail if server fails --- benchmarks/dsr1_fp8_h200_trt_slurm.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index 58d7e9724..a44769bc6 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -69,6 +69,7 @@ PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \ --tp_size=$TP --ep_size=$EP_SIZE \ --extra_llm_api_options=$EXTRA_CONFIG_FILE \ > $SERVER_LOG 2>&1 & +SERVER_PID=$! # Show logs until server is ready @@ -76,6 +77,10 @@ tail -f $SERVER_LOG & TAIL_PID=$! set +x until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + if ! kill -0 $SERVER_PID 2>/dev/null; then + echo "Server died before becoming healthy. Exiting." + exit 1 + fi sleep 5 done kill $TAIL_PID From 2d58f0df37f4ac456e5f0fff95fbc16d3df4a7e5 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 17:10:51 -0600 Subject: [PATCH 022/214] starting on b200 --- benchmarks/gptoss_fp4_b200_docker.sh | 28 +++++++++++++++++++++-- runners/launch_b200-tg.sh | 34 +--------------------------- 2 files changed, 27 insertions(+), 35 deletions(-) diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh index fd6ac15c5..28f3d29cf 100644 --- a/benchmarks/gptoss_fp4_b200_docker.sh +++ b/benchmarks/gptoss_fp4_b200_docker.sh @@ -43,8 +43,32 @@ export TORCH_CUDA_ARCH_LIST="10.0" export VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB='{"2":32,"4":32,"8":8}' export PYTHONNOUSERSITE=1 export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1 +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) set -x vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ ---gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs 512 \ ---disable-log-requests +--gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \ +--disable-log-requests > $SERVER_LOG 2>&1 & + +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
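Patch 021 hardens the wait loop in the dsr1 h200 script with a liveness guard: kill -0 delivers no signal at all, it only reports whether the PID can still be signalled, so the loop aborts as soon as the background server dies instead of polling forever. The same guard would slot into every other wait loop in this series; a sketch of the combined idiom, with some_server standing in for the real serve command:

some_server > "$SERVER_LOG" 2>&1 &
SERVER_PID=$!
until curl --output /dev/null --silent --fail "http://0.0.0.0:$PORT/health"; do
    # kill -0 sends no signal; non-zero status means the process is gone
    if ! kill -0 $SERVER_PID 2>/dev/null; then
        echo "Server died before becoming healthy. Exiting." >&2
        exit 1
    fi
    sleep 5
done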
+set +x +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID + +pip install -q datasets pandas +set -x +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ +--model $MODEL --backend vllm --base-url http://localhost:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) \ +--max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ --result-filename $RESULT_FILENAME.json \ No newline at end of file diff --git a/runners/launch_b200-tg.sh b/runners/launch_b200-tg.sh index 9f313396c..97e975a64 100644 --- a/runners/launch_b200-tg.sh +++ b/runners/launch_b200-tg.sh @@ -5,7 +5,6 @@ FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') PORT=8888 server_name="bmk-server" -client_name="bmk-client" set -x docker run --rm -d --network host --name $server_name \ @@ -14,38 +13,7 @@ docker run --rm -d --network host --name $server_name \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE \ -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ +-e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh" - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ Application\ startup\ complete ]]; then - break - fi -done < <(docker logs -f --tail=0 $server_name 2>&1) - -git clone https://github.com/kimbochen/bench_serving.git - -set -x -docker run --rm --network host --name $client_name \ --v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ ---entrypoint=/bin/bash \ -$(echo "$IMAGE" | sed 's/#/\//') \ --lc "pip install -q datasets pandas && \ -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm --base-url http://localhost:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) \ ---max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json" - -while [ -n "$(docker ps -aq)" ]; do - docker stop $server_name - sleep 5 -done From f5cf4a7167687ac86d9019edcf21cbc01631ab7c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 17:20:53 -0600 Subject: [PATCH 023/214] doign b200 --- benchmarks/dsr1_fp4_b200_docker.sh | 27 ++++++++++++++++++++++- benchmarks/dsr1_fp4_b200_trt_slurm.sh | 18 +++++++++------- benchmarks/dsr1_fp8_b200_docker.sh | 26 +++++++++++++++++++++- benchmarks/dsr1_fp8_b200_trt_slurm.sh | 31 +++++++++++++++------------ benchmarks/gptoss_fp4_b200_docker.sh | 2 ++ 5 files changed, 80 insertions(+), 24 deletions(-) diff --git a/benchmarks/dsr1_fp4_b200_docker.sh b/benchmarks/dsr1_fp4_b200_docker.sh index 3c8232072..6b2112478 100644 --- a/benchmarks/dsr1_fp4_b200_docker.sh +++ b/benchmarks/dsr1_fp4_b200_docker.sh @@ -6,6 +6,8 @@ nvidia-smi # happens 1% of the time. 
ref: https://github.com/flashinfer-ai/flashinfer/pull/1779 sed -i '102,108d' /usr/local/lib/python3.12/dist-packages/flashinfer/jit/cubin_loader.py +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) + # Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls. if [[ $CONC -ge 16 ]]; then SCHEDULER_RECV_INTERVAL=30 @@ -22,5 +24,28 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0. --cuda-graph-max-bs 256 --max-running-requests 256 --mem-fraction-static 0.85 --kv-cache-dtype fp8_e4m3 \ --chunked-prefill-size 16384 \ --ep-size $EP_SIZE --quantization modelopt_fp4 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ ---enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10 +--enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10 > $SERVER_LOG 2>&1 & + +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! +set +x +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID + +pip install -q datasets pandas +set -x +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ +--model $MODEL --backend vllm --base-url http://localhost:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) \ +--max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ --result-filename $RESULT_FILENAME.json diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh index 6f4f814a0..6896880fb 100644 --- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh @@ -101,17 +101,19 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \ > $SERVER_LOG 2>&1 & +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID -git clone https://github.com/kimbochen/bench_serving.git set -x -python3 bench_serving/benchmark_serving.py \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend openai \ --base-url http://0.0.0.0:$PORT \ --dataset-name random \ diff --git a/benchmarks/dsr1_fp8_b200_docker.sh b/benchmarks/dsr1_fp8_b200_docker.sh index 361b6f1f6..babb5c9a6 100644 --- a/benchmarks/dsr1_fp8_b200_docker.sh +++ b/benchmarks/dsr1_fp8_b200_docker.sh @@ -17,6 +17,7 @@ sed -i '102,108d' /usr/local/lib/python3.12/dist-packages/flashinfer/jit/cubin_l export SGL_ENABLE_JIT_DEEPGEMM=false export SGLANG_ENABLE_FLASHINFER_GEMM=true +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) # Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls. if [[ $CONC -ge 16 ]]; then @@ -34,4 +35,27 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0. 
--cuda-graph-max-bs 128 --max-running-requests 128 \ --mem-fraction-static 0.82 --kv-cache-dtype fp8_e4m3 --chunked-prefill-size 32768 --max-prefill-tokens 32768 \ --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL --disable-radix-cache \ ---attention-backend trtllm_mla --stream-interval 30 --ep-size $EP_SIZE --moe-runner-backend flashinfer_trtllm --quantization fp8 +--attention-backend trtllm_mla --stream-interval 30 --ep-size $EP_SIZE --moe-runner-backend flashinfer_trtllm --quantization fp8 > $SERVER_LOG 2>&1 & + +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! +set +x +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID + +pip install -q datasets pandas +set -x +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ +--model $MODEL --backend vllm --base-url http://localhost:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) \ +--max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ --result-filename $RESULT_FILENAME.json \ No newline at end of file diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh index 58d4525f1..81fc4137b 100644 --- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh @@ -69,25 +69,28 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \ --tp_size=$TP --ep_size=$EP_SIZE \ --extra_llm_api_options=$EXTRA_CONFIG_FILE \ > $SERVER_LOG 2>&1 & + +SERVER_PID=$! - +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID -git clone https://github.com/kimbochen/bench_serving.git +pip install -q datasets pandas set -x -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ +--model $MODEL --backend vllm --base-url http://localhost:$PORT \ --dataset-name random \ --random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--num-prompts $(( $CONC * 10 )) \ +--max-concurrency $CONC \ --request-rate inf --ignore-eos \ --save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +--result-dir /workspace/ --result-filename $RESULT_FILENAME.json diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh index 28f3d29cf..f28f525c0 100644 --- a/benchmarks/gptoss_fp4_b200_docker.sh +++ b/benchmarks/gptoss_fp4_b200_docker.sh @@ -50,6 +50,8 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ --gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \ --disable-log-requests > $SERVER_LOG 2>&1 & +SERVER_PID=$! 
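The NUM_PROMPTS values these patches start threading through the launchers encode a sizing rule that is otherwise scattered across the scripts: roughly ten requests per concurrency slot for most models, larger multiples for the DeepSeek-R1 variants, scaled back when OSL is 8192 so long generations keep total wall-clock time in check. A sketch of that rule as one function (the *DeepSeek-R1* glob generalizes the exact model-name checks in the scripts, and the rationale is inferred, not stated in the series):

num_prompts() {
    local conc=$1 model=$2 osl=$3
    case "$model" in
        *DeepSeek-R1*)
            if [[ "$osl" == "8192" ]]; then
                echo $(( conc * 20 ))
            else
                echo $(( conc * 50 ))
            fi
            ;;
        *)
            echo $(( conc * 10 ))
            ;;
    esac
}
NUM_PROMPTS=$(num_prompts "$CONC" "$MODEL" "$OSL")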
+ # Show logs until server is ready tail -f $SERVER_LOG & TAIL_PID=$! From 92af70bc77fbdfa00763ab51d493e559ac3d3e78 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 17:22:18 -0600 Subject: [PATCH 024/214] reverting erroneous change --- benchmarks/dsr1_fp8_b200_trt_slurm.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh index 81fc4137b..741ecdb92 100644 --- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh @@ -69,7 +69,7 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \ --tp_size=$TP --ep_size=$EP_SIZE \ --extra_llm_api_options=$EXTRA_CONFIG_FILE \ > $SERVER_LOG 2>&1 & - + SERVER_PID=$! # Show logs until server is ready @@ -81,16 +81,16 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID -pip install -q datasets pandas set -x BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend vllm --base-url http://localhost:$PORT \ +--model $MODEL --backend openai \ +--base-url http://0.0.0.0:$PORT \ --dataset-name random \ --random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) \ ---max-concurrency $CONC \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ --request-rate inf --ignore-eos \ --save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json From f330d672617cd1176b0eb771b5525617858444d7 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 18:00:48 -0600 Subject: [PATCH 025/214] fixing b200 --- benchmarks/dsr1_fp4_b200_docker.sh | 2 +- benchmarks/dsr1_fp8_b200_docker.sh | 2 +- benchmarks/gptoss_fp4_b200_docker.sh | 2 +- runners/launch_b200-nvd.sh | 44 +++++----------------------- 4 files changed, 11 insertions(+), 39 deletions(-) diff --git a/benchmarks/dsr1_fp4_b200_docker.sh b/benchmarks/dsr1_fp4_b200_docker.sh index 6b2112478..8b9f116c6 100644 --- a/benchmarks/dsr1_fp4_b200_docker.sh +++ b/benchmarks/dsr1_fp4_b200_docker.sh @@ -43,7 +43,7 @@ python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend vllm --base-url http://localhost:$PORT \ --dataset-name random \ --random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) \ +--num-prompts $NUM_PROMPTS \ --max-concurrency $CONC \ --request-rate inf --ignore-eos \ --save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ diff --git a/benchmarks/dsr1_fp8_b200_docker.sh b/benchmarks/dsr1_fp8_b200_docker.sh index babb5c9a6..f1412264c 100644 --- a/benchmarks/dsr1_fp8_b200_docker.sh +++ b/benchmarks/dsr1_fp8_b200_docker.sh @@ -54,7 +54,7 @@ python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend vllm --base-url http://localhost:$PORT \ --dataset-name random \ --random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) \ +--num-prompts $NUM_PROMPTS \ --max-concurrency $CONC \ --request-rate inf --ignore-eos \ --save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh index f28f525c0..530e61373 100644 --- 
a/benchmarks/gptoss_fp4_b200_docker.sh +++ b/benchmarks/gptoss_fp4_b200_docker.sh @@ -69,7 +69,7 @@ python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend vllm --base-url http://localhost:$PORT \ --dataset-name random \ --random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) \ +--num-prompts $NUM_PROMPTS \ --max-concurrency $CONC \ --request-rate inf --ignore-eos \ --save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ diff --git a/runners/launch_b200-nvd.sh b/runners/launch_b200-nvd.sh index 21a10d48f..a2587b477 100644 --- a/runners/launch_b200-nvd.sh +++ b/runners/launch_b200-nvd.sh @@ -25,29 +25,6 @@ set -x # Disabling it can reduce perf but will improve CI stability. i.e. we won't see vLLM/Sglang crashes. # Ref: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-graph-register - -docker run --rm -d --init --network host --name $server_name \ ---runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ --v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE \ --e NCCL_GRAPH_REGISTER=0 \ --e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ ---entrypoint=/bin/bash \ -$(echo "$IMAGE" | sed 's/#/\//') \ -benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh" - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ Application\ startup\ complete ]]; then - break - fi -done < <(docker logs -f --tail=0 $server_name 2>&1) - -git clone https://github.com/kimbochen/bench_serving.git - - if [[ "$MODEL" == "nvidia/DeepSeek-R1-0528-FP4" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then if [[ "$OSL" == "8192" ]]; then NUM_PROMPTS=$(( CONC * 20 )) @@ -58,22 +35,17 @@ else NUM_PROMPTS=$(( CONC * 10 )) fi -set -x -docker run --rm --network host --name $client_name \ +docker run --rm --init --network host --name $server_name \ +--runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ +-v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ +-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE \ +-e NCCL_GRAPH_REGISTER=0 \ +-e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ +-e -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e NUM_PROMPTS \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \ --lc "pip install -q datasets pandas && \ -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm --base-url http://localhost:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $NUM_PROMPTS \ ---max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json" +benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh" # Try graceful first docker stop -t 90 "$server_name" || true From c5fcf816fc85d099bd1bce0d5597f065892d6eed Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 
2025 18:02:36 -0600 Subject: [PATCH 026/214] fixing b200 pt 2 --- runners/launch_b200-nvd.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_b200-nvd.sh b/runners/launch_b200-nvd.sh index a2587b477..47c7c979f 100644 --- a/runners/launch_b200-nvd.sh +++ b/runners/launch_b200-nvd.sh @@ -42,7 +42,7 @@ docker run --rm --init --network host --name $server_name \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE \ -e NCCL_GRAPH_REGISTER=0 \ -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --e -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e NUM_PROMPTS \ +-e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e NUM_PROMPTS \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh" From 3ededf0a1ec9dab24326784b18107d2abb0da88c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 18:16:35 -0600 Subject: [PATCH 027/214] updating mi300 --- benchmarks/dsr1_fp8_mi300x_docker.sh | 27 ++++++++++++++++- benchmarks/dsr1_fp8_mi300x_slurm.sh | 18 ++++++------ benchmarks/gptoss_fp4_b200_docker.sh | 3 +- benchmarks/gptoss_fp4_mi300x_docker.sh | 26 ++++++++++++++++- benchmarks/gptoss_fp4_mi300x_slurm.sh | 19 ++++++------ runners/launch_b200-nvd.sh | 6 ++-- runners/launch_mi300x-amd.sh | 40 ++------------------------ 7 files changed, 78 insertions(+), 61 deletions(-) diff --git a/benchmarks/dsr1_fp8_mi300x_docker.sh b/benchmarks/dsr1_fp8_mi300x_docker.sh index fca44bcf1..82cb4fbee 100644 --- a/benchmarks/dsr1_fp8_mi300x_docker.sh +++ b/benchmarks/dsr1_fp8_mi300x_docker.sh @@ -24,6 +24,8 @@ fi export SGLANG_USE_AITER=1 +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) + set -x python3 -m sglang.launch_server \ --model-path=$MODEL --host=0.0.0.0 --port=$PORT --trust-remote-code \ @@ -33,4 +35,27 @@ python3 -m sglang.launch_server \ --chunked-prefill-size=196608 \ --num-continuous-decode-steps=4 \ --max-prefill-tokens=196608 \ ---disable-radix-cache +--disable-radix-cache > $SERVER_LOG 2>&1 & + + +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! +set +x +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID + +set -x +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ +--model=$MODEL --backend=vllm --base-url=http://$server_name:$PORT \ +--dataset-name=random \ +--random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ +--num-prompts=$(( $CONC * 10 )) \ +--max-concurrency=$CONC \ +--request-rate=inf --ignore-eos \ +--save-result --percentile-metrics="ttft,tpot,itl,e2el" \ +--result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json diff --git a/benchmarks/dsr1_fp8_mi300x_slurm.sh b/benchmarks/dsr1_fp8_mi300x_slurm.sh index 90babeaee..31fe1bf55 100644 --- a/benchmarks/dsr1_fp8_mi300x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi300x_slurm.sh @@ -47,17 +47,19 @@ python3 -m sglang.launch_server \ --disable-radix-cache \ > $SERVER_LOG 2>&1 & +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"The server is fired up and ready to roll"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model=$MODEL --backend=vllm \ --base-url="http://0.0.0.0:$PORT" \ --dataset-name=random \ diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh index 530e61373..ac9aefefe 100644 --- a/benchmarks/gptoss_fp4_b200_docker.sh +++ b/benchmarks/gptoss_fp4_b200_docker.sh @@ -43,6 +43,7 @@ export TORCH_CUDA_ARCH_LIST="10.0" export VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB='{"2":32,"4":32,"8":8}' export PYTHONNOUSERSITE=1 export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1 + SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) set -x @@ -66,7 +67,7 @@ set -x BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend vllm --base-url http://localhost:$PORT \ +--model $MODEL --backend vllm --base-url http://localhost:$PORT \ --dataset-name random \ --random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ --num-prompts $NUM_PROMPTS \ diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh index 66a8642bd..32efdf0fe 100644 --- a/benchmarks/gptoss_fp4_mi300x_docker.sh +++ b/benchmarks/gptoss_fp4_mi300x_docker.sh @@ -24,6 +24,8 @@ export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) + set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ @@ -34,4 +36,26 @@ vllm serve $MODEL --port $PORT \ --block-size=64 \ --no-enable-prefix-caching \ --disable-log-requests \ ---async-scheduling +--async-scheduling > $SERVER_LOG 2>&1 & + +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! +set +x +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID + +set -x +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ +--model=$MODEL --backend=vllm --base-url=http://$server_name:$PORT \ +--dataset-name=random \ +--random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ +--num-prompts=$(( $CONC * 10 )) \ +--max-concurrency=$CONC \ +--request-rate=inf --ignore-eos \ +--save-result --percentile-metrics="ttft,tpot,itl,e2el" \ +--result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json diff --git a/benchmarks/gptoss_fp4_mi300x_slurm.sh b/benchmarks/gptoss_fp4_mi300x_slurm.sh index 0ab5a250f..0e4a0b3b2 100644 --- a/benchmarks/gptoss_fp4_mi300x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi300x_slurm.sh @@ -48,17 +48,18 @@ vllm serve $MODEL --port $PORT \ --async-scheduling \ > $SERVER_LOG 2>&1 & +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend vllm \ --base-url http://0.0.0.0:$PORT \ --dataset-name random \ diff --git a/runners/launch_b200-nvd.sh b/runners/launch_b200-nvd.sh index 47c7c979f..c5216b006 100644 --- a/runners/launch_b200-nvd.sh +++ b/runners/launch_b200-nvd.sh @@ -27,12 +27,12 @@ set -x if [[ "$MODEL" == "nvidia/DeepSeek-R1-0528-FP4" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then if [[ "$OSL" == "8192" ]]; then - NUM_PROMPTS=$(( CONC * 20 )) + export NUM_PROMPTS=$(( CONC * 20 )) else - NUM_PROMPTS=$(( CONC * 50 )) + export NUM_PROMPTS=$(( CONC * 50 )) fi else - NUM_PROMPTS=$(( CONC * 10 )) + export NUM_PROMPTS=$(( CONC * 10 )) fi docker run --rm --init --network host --name $server_name \ diff --git a/runners/launch_mi300x-amd.sh b/runners/launch_mi300x-amd.sh index 51e059d4c..1f77a1ede 100644 --- a/runners/launch_mi300x-amd.sh +++ b/runners/launch_mi300x-amd.sh @@ -5,52 +5,16 @@ sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing' HF_HUB_CACHE_MOUNT="/shareddata/hf_hub_cache_$(hostname)/" PORT=8888 -network_name="bmk-net" server_name="bmk-server" -client_name="bmk-client" - -docker network create $network_name set -x -docker run --rm -d --ipc=host --shm-size=16g --network=$network_name --name=$server_name \ +docker run --rm --ipc=host --shm-size=16g --name=$server_name \ --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ --e ISL -e OSL \ +-e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi300x_docker.sh" - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ Application\ startup\ complete ]]; then - break - fi -done < <(docker logs -f --tail=0 $server_name 2>&1) - -git clone https://github.com/kimbochen/bench_serving.git - -set -x -docker run --rm --network=$network_name --name=$client_name \ --v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ ---entrypoint=python3 \ -$IMAGE \ -bench_serving/benchmark_serving.py \ ---model=$MODEL --backend=vllm --base-url=http://$server_name:$PORT \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) \ ---max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics="ttft,tpot,itl,e2el" \ ---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json - -while [ -n "$(docker ps -aq)" ]; do - docker stop $server_name - docker network rm $network_name - sleep 5 -done From 813381b9616173df45f11f4235b774e586a53d75 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 18:22:42 -0600 
Subject: [PATCH 028/214] updating mi300 pt 2 --- runners/launch_mi300x-amd.sh | 2 +- runners/launch_mi300x-cr.sh | 40 ++---------------------------------- runners/launch_mi300x-oci.sh | 40 ++---------------------------------- 3 files changed, 5 insertions(+), 77 deletions(-) diff --git a/runners/launch_mi300x-amd.sh b/runners/launch_mi300x-amd.sh index 1f77a1ede..780e5a2f0 100644 --- a/runners/launch_mi300x-amd.sh +++ b/runners/launch_mi300x-amd.sh @@ -8,7 +8,7 @@ PORT=8888 server_name="bmk-server" set -x -docker run --rm --ipc=host --shm-size=16g --name=$server_name \ +docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ diff --git a/runners/launch_mi300x-cr.sh b/runners/launch_mi300x-cr.sh index 48be17610..bdcc9e422 100644 --- a/runners/launch_mi300x-cr.sh +++ b/runners/launch_mi300x-cr.sh @@ -5,52 +5,16 @@ sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing' HF_HUB_CACHE_MOUNT="/mnt/vdb/gha_cache/hf_hub_cache/" PORT=8888 -network_name="bmk-net" server_name="bmk-server" -client_name="bmk-client" - -docker network create $network_name set -x -docker run --rm -d --ipc=host --shm-size=16g --network=$network_name --name=$server_name \ +docker run --rm -d --ipc=host --shm-size=16g --network=host --name=$server_name \ --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ --e ISL -e OSL \ +-e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi300x_docker.sh" - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ Application\ startup\ complete ]]; then - break - fi -done < <(docker logs -f --tail=0 $server_name 2>&1) - -git clone https://github.com/kimbochen/bench_serving.git - -set -x -docker run --rm --network=$network_name --name=$client_name \ --v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ ---entrypoint=python3 \ -$IMAGE \ -bench_serving/benchmark_serving.py \ ---model=$MODEL --backend=vllm --base-url=http://$server_name:$PORT \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) \ ---max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics="ttft,tpot,itl,e2el" \ ---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json - -while [ -n "$(docker ps -aq)" ]; do - docker stop $server_name - docker network rm $network_name - sleep 5 -done diff --git a/runners/launch_mi300x-oci.sh b/runners/launch_mi300x-oci.sh index 60cf9c238..2018cbc94 100644 --- a/runners/launch_mi300x-oci.sh +++ b/runners/launch_mi300x-oci.sh @@ -3,52 +3,16 @@ HF_HUB_CACHE_MOUNT="$HOME/hf_hub_cache/" PORT=8888 -network_name="bmk-net" server_name="bmk-server" -client_name="bmk-client" - -docker network create $network_name set -x -docker run --rm -d --ipc=host --shm-size=16g --network=$network_name --name=$server_name \ +docker run --rm -d --ipc=host --shm-size=16g --network=host --name=$server_name \ --privileged 
--cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ --e ISL -e OSL \ +-e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi300x_docker.sh" - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ Application\ startup\ complete ]]; then - break - fi -done < <(docker logs -f --tail=0 $server_name 2>&1) - -git clone https://github.com/kimbochen/bench_serving.git - -set -x -docker run --rm --network=$network_name --name=$client_name \ --v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ ---entrypoint=python3 \ -$IMAGE \ -bench_serving/benchmark_serving.py \ ---model=$MODEL --backend=vllm --base-url=http://$server_name:$PORT \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) \ ---max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics="ttft,tpot,itl,e2el" \ ---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json - -while [ -n "$(docker ps -aq)" ]; do - docker stop $server_name - docker network rm $network_name - sleep 5 -done From e1b387c4f8aa8d92f554cde5a03ca1e6b8282693 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 18:24:03 -0600 Subject: [PATCH 029/214] updating mi300 pt 3 -- remove detached mode --- runners/launch_mi300x-cr.sh | 2 +- runners/launch_mi300x-oci.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/runners/launch_mi300x-cr.sh b/runners/launch_mi300x-cr.sh index bdcc9e422..8fbdaee63 100644 --- a/runners/launch_mi300x-cr.sh +++ b/runners/launch_mi300x-cr.sh @@ -8,7 +8,7 @@ PORT=8888 server_name="bmk-server" set -x -docker run --rm -d --ipc=host --shm-size=16g --network=host --name=$server_name \ +docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ diff --git a/runners/launch_mi300x-oci.sh b/runners/launch_mi300x-oci.sh index 2018cbc94..33614a03c 100644 --- a/runners/launch_mi300x-oci.sh +++ b/runners/launch_mi300x-oci.sh @@ -6,7 +6,7 @@ PORT=8888 server_name="bmk-server" set -x -docker run --rm -d --ipc=host --shm-size=16g --network=host --name=$server_name \ +docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ From c0a5c62b51a58eeea87c8ba59b23e69c7bb31611 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 18:27:48 -0600 Subject: [PATCH 030/214] cleaning up mi355x --- benchmarks/gptoss_fp4_mi355x_docker.sh | 14 +++++--- runners/launch_mi355x-amd.sh | 47 -------------------------- 2 files changed, 9 insertions(+), 52 deletions(-) diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh index 533f5e212..8209857bd 100644 --- a/benchmarks/gptoss_fp4_mi355x_docker.sh +++ 
b/benchmarks/gptoss_fp4_mi355x_docker.sh @@ -22,6 +22,8 @@ export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_FUSED_MOE_A16W4=1 +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) + set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ @@ -32,18 +34,20 @@ vllm serve $MODEL --port $PORT \ --block-size=64 \ --no-enable-prefix-caching \ --disable-log-requests \ ---async-scheduling | tee $(mktemp /tmp/server-XXXXXX.log) & +--async-scheduling > $SERVER_LOG 2>&1 & -# Show server logs til' it is up, then stop showing +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! set +x -until curl --output /dev/null --silent --fail http://localhost:$PORT/health; do +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do sleep 5 done -pkill -P $$ tee 2>/dev/null +kill $TAIL_PID +set -x BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -set -x python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model=$MODEL --backend=vllm --base-url="http://localhost:$PORT" \ --dataset-name=random \ diff --git a/runners/launch_mi355x-amd.sh b/runners/launch_mi355x-amd.sh index 009a53108..e77daf5c2 100644 --- a/runners/launch_mi355x-amd.sh +++ b/runners/launch_mi355x-amd.sh @@ -17,11 +17,7 @@ HF_HUB_CACHE_MOUNT="/nfsdata/hf_hub_cache-1/" # Temp solution PORT=8888 -# network_name="bmk-net" server_name="bmk-server" -# client_name="bmk-client" - -# docker network create $network_name set -x docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ @@ -35,50 +31,7 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi355x_docker.sh" -# set +x -# while IFS= read -r line; do -# printf '%s\n' "$line" -# if [[ "$line" =~ Application\ startup\ complete ]]; then -# break -# fi -# done < <(docker logs -f --tail=0 $server_name 2>&1) - -# if [[ "$MODEL" == "amd/DeepSeek-R1-0528-MXFP4-Preview" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then -# if [[ "$OSL" == "8192" ]]; then -# NUM_PROMPTS=$(( CONC * 20 )) -# else -# NUM_PROMPTS=$(( CONC * 50 )) -# fi -# else -# NUM_PROMPTS=$(( CONC * 10 )) -# fi - -# git clone https://github.com/kimbochen/bench_serving.git - -# set -x -# docker run --rm --network=$network_name --name=$client_name \ -# -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -# -e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ -# --entrypoint=python3 \ -# $IMAGE \ -# bench_serving/benchmark_serving.py \ -# --model=$MODEL --backend=vllm --base-url="http://$server_name:$PORT" \ -# --dataset-name=random \ -# --random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ -# --num-prompts=$NUM_PROMPTS \ -# --max-concurrency=$CONC \ -# --request-rate=inf --ignore-eos \ -# --save-result --percentile-metrics="ttft,tpot,itl,e2el" \ -# --result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json - if ls gpucore.* 1> /dev/null 2>&1; then echo "gpucore files exist. 
not good" rm -f gpucore.* fi - - -# while [ -n "$(docker ps -aq)" ]; do -# docker stop $server_name -# # docker network rm $network_name -# sleep 5 -# done From 634768cd538b1f55899d714ec5f1cf670ff74d4a Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 18:33:16 -0600 Subject: [PATCH 031/214] fixing mi300x and updating 325x --- benchmarks/dsr1_fp8_mi325x_docker.sh | 26 ++++++++++++++++- benchmarks/dsr1_fp8_mi325x_slurm.sh | 18 ++++++------ benchmarks/gptoss_fp4_mi300x_docker.sh | 4 +-- benchmarks/gptoss_fp4_mi325x_docker.sh | 26 ++++++++++++++++- benchmarks/gptoss_fp4_mi325x_slurm.sh | 18 ++++++------ runners/launch_mi325x-amd.sh | 40 ++------------------------ 6 files changed, 74 insertions(+), 58 deletions(-) diff --git a/benchmarks/dsr1_fp8_mi325x_docker.sh b/benchmarks/dsr1_fp8_mi325x_docker.sh index f39a8dbbd..41f77ebd3 100644 --- a/benchmarks/dsr1_fp8_mi325x_docker.sh +++ b/benchmarks/dsr1_fp8_mi325x_docker.sh @@ -14,6 +14,8 @@ export SGLANG_USE_AITER=1 +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) + python3 -m sglang.launch_server \ --model-path $MODEL \ --host=0.0.0.0 \ @@ -24,5 +26,27 @@ python3 -m sglang.launch_server \ --mem-fraction-static 0.8 --disable-radix-cache \ --num-continuous-decode-steps 4 \ --max-prefill-tokens 196608 \ - --cuda-graph-max-bs 128 + --cuda-graph-max-bs 128 > $SERVER_LOG 2>&1 & + +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! +set +x +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID + +set -x +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ +--model=$MODEL --backend=vllm --base-url=http://0.0.0.0:$PORT \ +--dataset-name=random \ +--random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ +--num-prompts=$(( $CONC * 10 )) \ +--max-concurrency=$CONC \ +--request-rate=inf --ignore-eos \ +--save-result --percentile-metrics="ttft,tpot,itl,e2el" \ +--result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json diff --git a/benchmarks/dsr1_fp8_mi325x_slurm.sh b/benchmarks/dsr1_fp8_mi325x_slurm.sh index 09dae4dbb..f9da69095 100644 --- a/benchmarks/dsr1_fp8_mi325x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi325x_slurm.sh @@ -23,17 +23,19 @@ python3 -m sglang.launch_server \ --disable-radix-cache \ > $SERVER_LOG 2>&1 & +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"The server is fired up and ready to roll"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend vllm \ --base-url http://0.0.0.0:$PORT \ --dataset-name random \ diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh index 32efdf0fe..0b03900be 100644 --- a/benchmarks/gptoss_fp4_mi300x_docker.sh +++ b/benchmarks/gptoss_fp4_mi300x_docker.sh @@ -51,11 +51,11 @@ set -x BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model=$MODEL --backend=vllm --base-url=http://$server_name:$PORT \ +--model=$MODEL --backend=vllm --base-url=http://0.0.0.0:$PORT \ --dataset-name=random \ --random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ --num-prompts=$(( $CONC * 10 )) \ --max-concurrency=$CONC \ --request-rate=inf --ignore-eos \ --save-result --percentile-metrics="ttft,tpot,itl,e2el" \ ---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json +--result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_mi325x_docker.sh b/benchmarks/gptoss_fp4_mi325x_docker.sh index 05250267f..c57446da3 100644 --- a/benchmarks/gptoss_fp4_mi325x_docker.sh +++ b/benchmarks/gptoss_fp4_mi325x_docker.sh @@ -23,6 +23,8 @@ export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) + set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ @@ -33,4 +35,26 @@ vllm serve $MODEL --port $PORT \ --block-size=64 \ --no-enable-prefix-caching \ --disable-log-requests \ ---async-scheduling +--async-scheduling > $SERVER_LOG 2>&1 & + +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! +set +x +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID + +set -x +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ +--model=$MODEL --backend=vllm --base-url=http://0.0.0.0:$PORT \ +--dataset-name=random \ +--random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ +--num-prompts=$(( $CONC * 10 )) \ +--max-concurrency=$CONC \ +--request-rate=inf --ignore-eos \ +--save-result --percentile-metrics="ttft,tpot,itl,e2el" \ +--result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh index cab549cbc..9cbef3276 100644 --- a/benchmarks/gptoss_fp4_mi325x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh @@ -48,17 +48,19 @@ vllm serve $MODEL --port $PORT \ --async-scheduling \ > $SERVER_LOG 2>&1 & +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend vllm \ --base-url http://0.0.0.0:$PORT \ --dataset-name random \ diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh index 91b9bfad3..4dd66bc17 100644 --- a/runners/launch_mi325x-amd.sh +++ b/runners/launch_mi325x-amd.sh @@ -5,52 +5,16 @@ sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing' HF_HUB_CACHE_MOUNT="/home/kimbosemianalysis/hf_hub_cache/" PORT=8888 -network_name="bmk-net" server_name="bmk-server" -client_name="bmk-client" - -docker network create $network_name set -x -docker run --rm -d --ipc=host --shm-size=16g --network=$network_name --name=$server_name \ +docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ --e ISL -e OSL \ +-e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi325x_docker.sh" - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ Application\ startup\ complete ]]; then - break - fi -done < <(docker logs -f --tail=0 $server_name 2>&1) - -git clone https://github.com/kimbochen/bench_serving.git - -set -x -docker run --rm --network=$network_name --name=$client_name \ --v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ ---entrypoint=python3 \ -$IMAGE \ -bench_serving/benchmark_serving.py \ ---model=$MODEL --backend=vllm --base-url=http://$server_name:$PORT \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) \ ---max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics="ttft,tpot,itl,e2el" \ ---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json - -while [ -n "$(docker ps -aq)" ]; do - docker stop $server_name - docker network rm $network_name - sleep 5 -done From 61a5c8f1415d957ab0251c23e90f1b572da05408 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 18:38:43 -0600 Subject: [PATCH 032/214] reverting max conc to 512 on gptoss fp4 b200 docker --- benchmarks/gptoss_fp4_h100_docker.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index ae889474c..2cec8a165 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -51,7 +51,7 @@ python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --base-url=http://localhost:$PORT \ --dataset-name=random \ --random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ 
---num-prompts=$(( $CONC * 10 )) --max-concurrency=$CONC \ +--num-prompts=$(( $CONC * 10 )) --max-concurrency=512 \ --request-rate=inf --ignore-eos \ --save-result --percentile-metrics='ttft,tpot,itl,e2el' \ --result-dir=/workspace/ \ From 74363e41efe22dc29389fc7952e75eea9fe5b5ed Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 18:43:58 -0600 Subject: [PATCH 033/214] mi325x debug --- runners/launch_mi325x-amd.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh index 4dd66bc17..481603aa8 100644 --- a/runners/launch_mi325x-amd.sh +++ b/runners/launch_mi325x-amd.sh @@ -1,5 +1,7 @@ #!/usr/bin/bash +set -x + sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing' HF_HUB_CACHE_MOUNT="/home/kimbosemianalysis/hf_hub_cache/" @@ -17,4 +19,4 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ -e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \ --entrypoint=/bin/bash \ $IMAGE \ -benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi325x_docker.sh" +benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi325x_docker.sh" \ No newline at end of file From 220e0261a4f984dff0ab543b79976a7981a9f47e Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 13 Nov 2025 19:05:38 -0600 Subject: [PATCH 034/214] add back correct launch script for new mi325x slurm cluster (#231) --- runners/launch_mi325x-amd.sh | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh index 481603aa8..b622ee2e8 100644 --- a/runners/launch_mi325x-amd.sh +++ b/runners/launch_mi325x-amd.sh @@ -1,22 +1,24 @@ -#!/usr/bin/bash +#!/usr/bin/env bash -set -x +export HF_HUB_CACHE_MOUNT="/nfsdata/sa/hf_hub_cache-${USER: -1}/" +export PORT_OFFSET=${USER: -1} -sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing' +PARTITION="compute" +SQUASH_FILE="/nfsdata/sa/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" -HF_HUB_CACHE_MOUNT="/home/kimbosemianalysis/hf_hub_cache/" -PORT=8888 +set -x +salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=256 --time=180 --no-shell +JOB_ID=$(squeue -u $USER -h -o %A | head -n1) -server_name="bmk-server" +srun --jobid=$JOB_ID bash -c "sudo enroot import -o $SQUASH_FILE docker://$IMAGE" +srun --jobid=$JOB_ID \ +--container-image=$SQUASH_FILE \ +--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ +--container-mount-home \ +--container-writable \ +--container-remap-root \ +--container-workdir=/workspace/ \ +--no-container-entrypoint --export=ALL \ +bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_mi325x_slurm.sh -set -x -docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ ---privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ ---cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ --v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ --e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \ ---entrypoint=/bin/bash \ -$IMAGE \ -benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi325x_docker.sh" \ No newline at end of file +scancel $JOB_ID \ No newline at end of file From 5db1af845ac396a15de9faefd11794d1165330db Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 18:33:16 -0600 Subject: [PATCH 035/214] 
fixing mi300x and updating 325x --- runners/launch_mi325x-amd.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh index b622ee2e8..1065167d7 100644 --- a/runners/launch_mi325x-amd.sh +++ b/runners/launch_mi325x-amd.sh @@ -21,4 +21,4 @@ srun --jobid=$JOB_ID \ --no-container-entrypoint --export=ALL \ bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_mi325x_slurm.sh -scancel $JOB_ID \ No newline at end of file +scancel $JOB_ID From b4eb57ee9d424141c8557d0494116de6ae6027da Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 14 Nov 2025 09:04:15 -0600 Subject: [PATCH 036/214] cleanng up --- benchmarks/dsr1_fp4_mi355x_docker.sh | 27 ++++++++++++++++++++++++- benchmarks/dsr1_fp4_mi355x_slurm.sh | 13 +++--------- benchmarks/dsr1_fp8_h200_slurm.sh | 13 +++--------- benchmarks/dsr1_fp8_mi355x_docker.sh | 13 ++++++++---- benchmarks/dsr1_fp8_mi355x_slurm.sh | 18 +++++++++-------- benchmarks/gptoss_fp4_b200_trt_slurm.sh | 20 +++++++++--------- benchmarks/gptoss_fp4_h200_slurm.sh | 18 +++++++++-------- 7 files changed, 72 insertions(+), 50 deletions(-) diff --git a/benchmarks/dsr1_fp4_mi355x_docker.sh b/benchmarks/dsr1_fp4_mi355x_docker.sh index 4d3ed084c..72c4e4778 100644 --- a/benchmarks/dsr1_fp4_mi355x_docker.sh +++ b/benchmarks/dsr1_fp4_mi355x_docker.sh @@ -18,6 +18,8 @@ if [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then fi fi +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) + set -x python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \ --host=0.0.0.0 --port=$PORT \ @@ -27,5 +29,28 @@ python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \ --disable-radix-cache \ --num-continuous-decode-steps=4 \ --max-prefill-tokens=$PREFILL_SIZE \ ---cuda-graph-max-bs=128 +--cuda-graph-max-bs=128 > $SERVER_LOG 2>&1 & + +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
+set +x +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID + +set -x +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ +--model=$MODEL --backend=vllm --base-url="http://localhost:$PORT" \ +--dataset-name=random \ +--random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ +--num-prompts=$NUM_PROMPTS \ +--max-concurrency=$CONC \ +--request-rate=inf --ignore-eos \ +--save-result --percentile-metrics="ttft,tpot,itl,e2el" \ +--result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json + diff --git a/benchmarks/dsr1_fp4_mi355x_slurm.sh b/benchmarks/dsr1_fp4_mi355x_slurm.sh index b88a90f46..ffd2883fd 100644 --- a/benchmarks/dsr1_fp4_mi355x_slurm.sh +++ b/benchmarks/dsr1_fp4_mi355x_slurm.sh @@ -34,17 +34,10 @@ python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \ --cuda-graph-max-bs=128 \ > $SERVER_LOG 2>&1 & -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"The server is fired up and ready to roll"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend vllm \ --base-url "http://0.0.0.0:$PORT" \ --dataset-name random \ diff --git a/benchmarks/dsr1_fp8_h200_slurm.sh b/benchmarks/dsr1_fp8_h200_slurm.sh index 86ea0024f..2298b5486 100644 --- a/benchmarks/dsr1_fp8_h200_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_slurm.sh @@ -44,17 +44,10 @@ else > $SERVER_LOG 2>&1 & fi -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend vllm \ --base-url http://0.0.0.0:$PORT \ --dataset-name random \ diff --git a/benchmarks/dsr1_fp8_mi355x_docker.sh b/benchmarks/dsr1_fp8_mi355x_docker.sh index baad70fd8..2ee734495 100644 --- a/benchmarks/dsr1_fp8_mi355x_docker.sh +++ b/benchmarks/dsr1_fp8_mi355x_docker.sh @@ -14,6 +14,8 @@ export SGLANG_USE_AITER=1 +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) + python3 -m sglang.launch_server \ --model-path $MODEL \ --host=0.0.0.0 \ @@ -24,13 +26,16 @@ python3 -m sglang.launch_server \ --mem-fraction-static 0.8 --disable-radix-cache \ --num-continuous-decode-steps 4 \ --max-prefill-tokens 196608 \ - --cuda-graph-max-bs 128 | tee $(mktemp /tmp/server-XXXXXX.log) & + --cuda-graph-max-bs 128 > $SERVER_LOG 2>&1 & +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
set +x -until curl --output /dev/null --silent --fail http://localhost:$PORT/health; do +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do sleep 5 done -pkill -P $$ tee 2>/dev/null +kill $TAIL_PID if [[ "$MODEL" == "amd/DeepSeek-R1-0528-MXFP4-Preview" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then if [[ "$OSL" == "8192" ]]; then @@ -42,9 +47,9 @@ else NUM_PROMPTS=$(( CONC * 10 )) fi +set -x BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -set -x python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model=$MODEL --backend=vllm --base-url="http://localhost:$PORT" \ --dataset-name=random \ diff --git a/benchmarks/dsr1_fp8_mi355x_slurm.sh b/benchmarks/dsr1_fp8_mi355x_slurm.sh index bf5d60e9c..0bdc36024 100644 --- a/benchmarks/dsr1_fp8_mi355x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi355x_slurm.sh @@ -32,17 +32,19 @@ python3 -m sglang.launch_server \ --max-prefill-tokens 196608 \ --cuda-graph-max-bs 128 > $SERVER_LOG 2>&1 & +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"The server is fired up and ready to roll"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend vllm \ --base-url "http://0.0.0.0:$PORT" \ --dataset-name random \ diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh index 349930dfb..92477dd56 100644 --- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh @@ -79,17 +79,19 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \ > $SERVER_LOG 2>&1 & +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -git clone https://github.com/kimbochen/bench_serving.git +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID + set -x -python3 bench_serving/benchmark_serving.py \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend openai \ --base-url http://0.0.0.0:$PORT \ --dataset-name random \ diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh index f92c60425..37851d39c 100644 --- a/benchmarks/gptoss_fp4_h200_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_slurm.sh @@ -48,17 +48,19 @@ PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config --gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \ --disable-log-requests > $SERVER_LOG 2>&1 & +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
 set +x
-while IFS= read -r line; do
-    printf '%s\n' "$line"
-    if [[ "$line" == *"Application startup complete"* ]]; then
-        break
-    fi
-done < <(tail -F -n0 "$SERVER_LOG")
+until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do
+    sleep 5
+done
+kill $TAIL_PID
 
 set -x
-git clone https://github.com/kimbochen/bench_serving.git
-python3 bench_serving/benchmark_serving.py \
+BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX)
+git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR
+python3 $BENCH_SERVING_DIR/benchmark_serving.py \
 --model $MODEL --backend vllm \
 --base-url http://0.0.0.0:$PORT \
 --dataset-name random \

From 04e30f350bc5df5e7f556dea612ba3cd7e464df8 Mon Sep 17 00:00:00 2001
From: Cam Quilici
Date: Fri, 14 Nov 2025 09:09:20 -0600
Subject: [PATCH 037/214] add wait for h200 slurm dsr1

---
 benchmarks/dsr1_fp8_h200_slurm.sh | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/benchmarks/dsr1_fp8_h200_slurm.sh b/benchmarks/dsr1_fp8_h200_slurm.sh
index 2298b5486..14e3c2a7b 100644
--- a/benchmarks/dsr1_fp8_h200_slurm.sh
+++ b/benchmarks/dsr1_fp8_h200_slurm.sh
@@ -44,6 +44,15 @@ else
     > $SERVER_LOG 2>&1 &
 fi
 
+# Show logs until server is ready
+tail -f $SERVER_LOG &
+TAIL_PID=$!
+set +x
+until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do
+    sleep 5
+done
+kill $TAIL_PID
+
 set -x
 BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX)
 git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR

From d36965abc3d33099c320f13fc39221326a13b961 Mon Sep 17 00:00:00 2001
From: Cam Quilici
Date: Fri, 14 Nov 2025 09:20:27 -0600
Subject: [PATCH 038/214] max num seqs back to 512 for gptoss fp4 b200 docker

---
 benchmarks/gptoss_fp4_b200_docker.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh
index ac9aefefe..878564e11 100644
--- a/benchmarks/gptoss_fp4_b200_docker.sh
+++ b/benchmarks/gptoss_fp4_b200_docker.sh
@@ -48,7 +48,7 @@ SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
 
 set -x
 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \
---gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \
+--gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs 512 \
 --disable-log-requests > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
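
The readiness-wait block that these patches copy into each benchmark script (tail -f on the server log, then polling the /health endpoint every 5 seconds) is duplicated across every hardware variant by this point in the series. Below is a minimal sketch of the same pattern as a shared shell helper, folded together with the kill -0 liveness check that a later patch in this series adds to gptoss_fp4_h100_docker.sh. The wait_for_server name and the timeout parameter are illustrative assumptions, not code that exists in this repo:

#!/usr/bin/env bash
# Sketch only: wait_for_server and its timeout are assumed names, not repo code.
# Streams the server log while polling the health endpoint, and fails fast if
# the server process exits before it ever becomes healthy.
wait_for_server() {
    local port=$1 server_pid=$2 server_log=$3 timeout=${4:-1800}
    local deadline=$(( SECONDS + timeout ))

    # Mirror the inline "show logs until server is ready" blocks.
    tail -f "$server_log" &
    local tail_pid=$!

    until curl --output /dev/null --silent --fail "http://0.0.0.0:${port}/health"; do
        # Stop polling if the server process died instead of waiting forever.
        if ! kill -0 "$server_pid" 2>/dev/null; then
            echo "Server died before becoming healthy." >&2
            kill "$tail_pid" 2>/dev/null
            return 1
        fi
        # Bail out after the (assumed) timeout rather than hanging the CI job.
        if (( SECONDS >= deadline )); then
            echo "Server not healthy after ${timeout}s." >&2
            kill "$tail_pid" 2>/dev/null
            return 1
        fi
        sleep 5
    done

    kill "$tail_pid" 2>/dev/null
}

With a helper like this, each script's inline block would reduce to a single call such as: wait_for_server "$PORT" "$SERVER_PID" "$SERVER_LOG" || exit 1, keeping failure propagation consistent across the docker and slurm variants.
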
From fa7cbca344a604f6b578a918609582edfc29eb04 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 14 Nov 2025 09:40:10 -0600 Subject: [PATCH 039/214] fix port issue for dsr1 mi300x docker --- benchmarks/dsr1_fp8_mi300x_docker.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/dsr1_fp8_mi300x_docker.sh b/benchmarks/dsr1_fp8_mi300x_docker.sh index 82cb4fbee..033b84ed0 100644 --- a/benchmarks/dsr1_fp8_mi300x_docker.sh +++ b/benchmarks/dsr1_fp8_mi300x_docker.sh @@ -51,7 +51,7 @@ set -x BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model=$MODEL --backend=vllm --base-url=http://$server_name:$PORT \ +--model=$MODEL --backend=vllm --base-url=http://0.0.0.0:$PORT \ --dataset-name=random \ --random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ --num-prompts=$(( $CONC * 10 )) \ From 1031ac957ec607bb586bceda5179ab2145e6836f Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 14 Nov 2025 11:50:53 -0600 Subject: [PATCH 040/214] fix mi355x docker NUM_PROMPTS --- runners/launch_mi355x-amd.sh | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/runners/launch_mi355x-amd.sh b/runners/launch_mi355x-amd.sh index e77daf5c2..5f3cbb290 100644 --- a/runners/launch_mi355x-amd.sh +++ b/runners/launch_mi355x-amd.sh @@ -19,13 +19,23 @@ PORT=8888 server_name="bmk-server" +if [[ "$MODEL" == "amd/DeepSeek-R1-0528-MXFP4-Preview" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then + if [[ "$OSL" == "8192" ]]; then + export NUM_PROMPTS=$(( CONC * 20 )) + else + export NUM_PROMPTS=$(( CONC * 50 )) + fi +else + export NUM_PROMPTS=$(( CONC * 10 )) +fi + set -x docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ +-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT -e NUM_PROMPTS \ -e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \ --entrypoint=/bin/bash \ $IMAGE \ From 8b847f13bfb1a76689460aadaeaac61b92539e47 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 14 Nov 2025 15:46:51 -0600 Subject: [PATCH 041/214] adding prop of failure for server logs --- benchmarks/dsr1_fp4_mi355x_slurm.sh | 9 +++++++++ benchmarks/dsr1_fp8_h200_trt_slurm.sh | 2 +- benchmarks/gptoss_fp4_h100_docker.sh | 6 ++++++ runners/launch_b200-nb.sh | 2 ++ 4 files changed, 18 insertions(+), 1 deletion(-) diff --git a/benchmarks/dsr1_fp4_mi355x_slurm.sh b/benchmarks/dsr1_fp4_mi355x_slurm.sh index ffd2883fd..cad5efdc5 100644 --- a/benchmarks/dsr1_fp4_mi355x_slurm.sh +++ b/benchmarks/dsr1_fp4_mi355x_slurm.sh @@ -34,6 +34,15 @@ python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \ --cuda-graph-max-bs=128 \ > $SERVER_LOG 2>&1 & +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
+set +x +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID + set -x BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index a44769bc6..15647bbd2 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -69,9 +69,9 @@ PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \ --tp_size=$TP --ep_size=$EP_SIZE \ --extra_llm_api_options=$EXTRA_CONFIG_FILE \ > $SERVER_LOG 2>&1 & + SERVER_PID=$! - # Show logs until server is ready tail -f $SERVER_LOG & TAIL_PID=$! diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 2cec8a165..6b95fae1a 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -32,11 +32,17 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --max-num-seqs=$CONC \ --disable-log-requests > $SERVER_LOG 2>&1 & +SERVER_PID=$! + # Show logs until server is ready tail -f $SERVER_LOG & TAIL_PID=$! set +x until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + if ! kill -0 $SERVER_PID 2>/dev/null; then + echo "Server died before becoming healthy. Exiting." + exit 1 + fi sleep 5 done kill $TAIL_PID diff --git a/runners/launch_b200-nb.sh b/runners/launch_b200-nb.sh index 9a3dfa909..ecd1466dd 100644 --- a/runners/launch_b200-nb.sh +++ b/runners/launch_b200-nb.sh @@ -13,3 +13,5 @@ srun --partition=$PARTITION --gres=gpu:$TP --exclusive \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT_OFFSET=${USER: -1} \ bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_slurm.sh + +scancel $JOB_ID \ No newline at end of file From 832bafce3354dce9703c6d74084baac18b4b29c3 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 14 Nov 2025 16:05:09 -0600 Subject: [PATCH 042/214] add utils function for benchmark --- benchmarks/benchmark_lib.sh | 143 ++++++++++++++++++++++++ benchmarks/dsr1_fp4_b200_docker.sh | 26 +++-- benchmarks/dsr1_fp4_b200_trt_slurm.sh | 26 +++-- benchmarks/dsr1_fp4_mi355x_docker.sh | 25 +++-- benchmarks/dsr1_fp4_mi355x_slurm.sh | 25 +++-- benchmarks/dsr1_fp8_b200_docker.sh | 26 +++-- benchmarks/dsr1_fp8_b200_trt_slurm.sh | 26 +++-- benchmarks/dsr1_fp8_h200_slurm.sh | 26 +++-- benchmarks/dsr1_fp8_h200_trt_slurm.sh | 26 +++-- benchmarks/dsr1_fp8_mi300x_docker.sh | 25 +++-- benchmarks/dsr1_fp8_mi300x_slurm.sh | 26 +++-- benchmarks/dsr1_fp8_mi325x_docker.sh | 25 +++-- benchmarks/dsr1_fp8_mi325x_slurm.sh | 26 +++-- benchmarks/dsr1_fp8_mi355x_docker.sh | 25 +++-- benchmarks/dsr1_fp8_mi355x_slurm.sh | 25 +++-- benchmarks/gptoss_fp4_b200_docker.sh | 26 +++-- benchmarks/gptoss_fp4_b200_trt_slurm.sh | 26 +++-- benchmarks/gptoss_fp4_h100_docker.sh | 28 ++--- benchmarks/gptoss_fp4_h100_slurm.sh | 28 ++--- benchmarks/gptoss_fp4_h200_slurm.sh | 26 +++-- benchmarks/gptoss_fp4_h200_trt_slurm.sh | 26 +++-- benchmarks/gptoss_fp4_mi300x_docker.sh | 25 +++-- benchmarks/gptoss_fp4_mi300x_slurm.sh | 26 +++-- benchmarks/gptoss_fp4_mi325x_docker.sh | 25 +++-- benchmarks/gptoss_fp4_mi325x_slurm.sh | 26 +++-- benchmarks/gptoss_fp4_mi355x_docker.sh | 25 +++-- benchmarks/gptoss_fp4_mi355x_slurm.sh | 25 +++-- 27 files changed, 512 insertions(+), 301 deletions(-) create mode 100644 benchmarks/benchmark_lib.sh diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh new file mode 100644 
index 000000000..152b5e4b6 --- /dev/null +++ b/benchmarks/benchmark_lib.sh @@ -0,0 +1,143 @@ +#!/usr/bin/env bash + +# Shared benchmarking utilities for InferenceMAX + +# Run benchmark serving with standardized parameters +# All parameters are required +# Parameters: +# --model: Model name +# --port: Server port +# --backend: Backend type - 'vllm' or 'openai' +# --input-len: Random input sequence length +# --output-len: Random output sequence length +# --random-range-ratio: Random range ratio +# --num-prompts: Number of prompts +# --max-concurrency: Max concurrency +# --result-filename: Result filename without extension +# --result-dir: Result directory +run_benchmark_serving() { + local model="" + local port="" + local backend="" + local input_len="" + local output_len="" + local random_range_ratio="" + local num_prompts="" + local max_concurrency="" + local result_filename="" + local result_dir="" + + # Parse arguments + while [[ $# -gt 0 ]]; do + case $1 in + --model) + model="$2" + shift 2 + ;; + --port) + port="$2" + shift 2 + ;; + --backend) + backend="$2" + shift 2 + ;; + --input-len) + input_len="$2" + shift 2 + ;; + --output-len) + output_len="$2" + shift 2 + ;; + --random-range-ratio) + random_range_ratio="$2" + shift 2 + ;; + --num-prompts) + num_prompts="$2" + shift 2 + ;; + --max-concurrency) + max_concurrency="$2" + shift 2 + ;; + --result-filename) + result_filename="$2" + shift 2 + ;; + --result-dir) + result_dir="$2" + shift 2 + ;; + *) + echo "Unknown parameter: $1" + return 1 + ;; + esac + done + + # Validate all required parameters + if [[ -z "$model" ]]; then + echo "Error: --model is required" + return 1 + fi + if [[ -z "$port" ]]; then + echo "Error: --port is required" + return 1 + fi + if [[ -z "$backend" ]]; then + echo "Error: --backend is required" + return 1 + fi + if [[ -z "$input_len" ]]; then + echo "Error: --input-len is required" + return 1 + fi + if [[ -z "$output_len" ]]; then + echo "Error: --output-len is required" + return 1 + fi + if [[ -z "$random_range_ratio" ]]; then + echo "Error: --random-range-ratio is required" + return 1 + fi + if [[ -z "$num_prompts" ]]; then + echo "Error: --num-prompts is required" + return 1 + fi + if [[ -z "$max_concurrency" ]]; then + echo "Error: --max-concurrency is required" + return 1 + fi + if [[ -z "$result_filename" ]]; then + echo "Error: --result-filename is required" + return 1 + fi + if [[ -z "$result_dir" ]]; then + echo "Error: --result-dir is required" + return 1 + fi + + # Clone benchmark serving repo + local BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) + git clone https://github.com/kimbochen/bench_serving.git "$BENCH_SERVING_DIR" + + # Run benchmark + python3 "$BENCH_SERVING_DIR/benchmark_serving.py" \ + --model "$model" \ + --backend "$backend" \ + --base-url "http://0.0.0.0:$port" \ + --dataset-name random \ + --random-input-len "$input_len" \ + --random-output-len "$output_len" \ + --random-range-ratio "$random_range_ratio" \ + --num-prompts "$num_prompts" \ + --max-concurrency "$max_concurrency" \ + --request-rate inf \ + --ignore-eos \ + --save-result \ + --percentile-metrics 'ttft,tpot,itl,e2el' \ + --result-dir "$result_dir" \ + --result-filename "$result_filename.json" +} diff --git a/benchmarks/dsr1_fp4_b200_docker.sh b/benchmarks/dsr1_fp4_b200_docker.sh index 8b9f116c6..317015ba8 100644 --- a/benchmarks/dsr1_fp4_b200_docker.sh +++ b/benchmarks/dsr1_fp4_b200_docker.sh @@ -36,16 +36,20 @@ done kill $TAIL_PID pip install -q datasets pandas + +# Source benchmark utilities +source "$(dirname 
"$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend vllm --base-url http://localhost:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $NUM_PROMPTS \ ---max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$NUM_PROMPTS" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh index 6896880fb..897ef8527 100644 --- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh @@ -110,16 +110,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend openai \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp4_mi355x_docker.sh b/benchmarks/dsr1_fp4_mi355x_docker.sh index 72c4e4778..05603ecae 100644 --- a/benchmarks/dsr1_fp4_mi355x_docker.sh +++ b/benchmarks/dsr1_fp4_mi355x_docker.sh @@ -40,17 +40,20 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model=$MODEL --backend=vllm --base-url="http://localhost:$PORT" \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$NUM_PROMPTS \ ---max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics="ttft,tpot,itl,e2el" \ ---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$NUM_PROMPTS" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git 
a/benchmarks/dsr1_fp4_mi355x_slurm.sh b/benchmarks/dsr1_fp4_mi355x_slurm.sh index cad5efdc5..c47bbfb38 100644 --- a/benchmarks/dsr1_fp4_mi355x_slurm.sh +++ b/benchmarks/dsr1_fp4_mi355x_slurm.sh @@ -43,16 +43,19 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url "http://0.0.0.0:$PORT" \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics "ttft,tpot,itl,e2el" \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp8_b200_docker.sh b/benchmarks/dsr1_fp8_b200_docker.sh index f1412264c..fa498ff3e 100644 --- a/benchmarks/dsr1_fp8_b200_docker.sh +++ b/benchmarks/dsr1_fp8_b200_docker.sh @@ -47,15 +47,19 @@ done kill $TAIL_PID pip install -q datasets pandas + +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend vllm --base-url http://localhost:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $NUM_PROMPTS \ ---max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json \ No newline at end of file +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$NUM_PROMPTS" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ No newline at end of file diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh index 741ecdb92..a22536d82 100644 --- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh @@ -81,16 +81,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ 
---result-filename $RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend openai \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp8_h200_slurm.sh b/benchmarks/dsr1_fp8_h200_slurm.sh index 14e3c2a7b..7444f763f 100644 --- a/benchmarks/dsr1_fp8_h200_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_slurm.sh @@ -53,16 +53,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index 15647bbd2..94baa7850 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -85,16 +85,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend openai \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp8_mi300x_docker.sh b/benchmarks/dsr1_fp8_mi300x_docker.sh index 033b84ed0..5ffebb941 100644 --- a/benchmarks/dsr1_fp8_mi300x_docker.sh +++ b/benchmarks/dsr1_fp8_mi300x_docker.sh @@ -47,15 +47,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ 
---model=$MODEL --backend=vllm --base-url=http://0.0.0.0:$PORT \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) \ ---max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics="ttft,tpot,itl,e2el" \ ---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp8_mi300x_slurm.sh b/benchmarks/dsr1_fp8_mi300x_slurm.sh index 31fe1bf55..ba1597982 100644 --- a/benchmarks/dsr1_fp8_mi300x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi300x_slurm.sh @@ -56,16 +56,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model=$MODEL --backend=vllm \ ---base-url="http://0.0.0.0:$PORT" \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) --max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics='ttft,tpot,itl,e2el' \ ---result-dir=/workspace/ \ ---result-filename=$RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp8_mi325x_docker.sh b/benchmarks/dsr1_fp8_mi325x_docker.sh index 41f77ebd3..c0f95846d 100644 --- a/benchmarks/dsr1_fp8_mi325x_docker.sh +++ b/benchmarks/dsr1_fp8_mi325x_docker.sh @@ -37,16 +37,19 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model=$MODEL --backend=vllm --base-url=http://0.0.0.0:$PORT \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) \ ---max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics="ttft,tpot,itl,e2el" \ ---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp8_mi325x_slurm.sh b/benchmarks/dsr1_fp8_mi325x_slurm.sh index f9da69095..1ccec681f 100644 --- a/benchmarks/dsr1_fp8_mi325x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi325x_slurm.sh @@ -32,16 +32,18 @@ until 
curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp8_mi355x_docker.sh b/benchmarks/dsr1_fp8_mi355x_docker.sh index 2ee734495..50d9bb02d 100644 --- a/benchmarks/dsr1_fp8_mi355x_docker.sh +++ b/benchmarks/dsr1_fp8_mi355x_docker.sh @@ -47,18 +47,21 @@ else NUM_PROMPTS=$(( CONC * 10 )) fi +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model=$MODEL --backend=vllm --base-url="http://localhost:$PORT" \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$NUM_PROMPTS \ ---max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics="ttft,tpot,itl,e2el" \ ---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$NUM_PROMPTS" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp8_mi355x_slurm.sh b/benchmarks/dsr1_fp8_mi355x_slurm.sh index 0bdc36024..86b5f9649 100644 --- a/benchmarks/dsr1_fp8_mi355x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi355x_slurm.sh @@ -41,15 +41,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url "http://0.0.0.0:$PORT" \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics "ttft,tpot,itl,e2el" \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + 
--result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh index 878564e11..208ea278d 100644 --- a/benchmarks/gptoss_fp4_b200_docker.sh +++ b/benchmarks/gptoss_fp4_b200_docker.sh @@ -63,15 +63,19 @@ done kill $TAIL_PID pip install -q datasets pandas + +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend vllm --base-url http://localhost:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $NUM_PROMPTS \ ---max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json \ No newline at end of file +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$NUM_PROMPTS" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh index 92477dd56..4647cb346 100644 --- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh @@ -88,16 +88,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend openai \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 6b95fae1a..42bbf6b1a 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -48,17 +48,19 @@ done kill $TAIL_PID pip install -q datasets pandas + +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model=$MODEL \ ---backend=vllm \ ---base-url=http://localhost:$PORT \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) --max-concurrency=512 \ ---request-rate=inf --ignore-eos \ ---save-result 
--percentile-metrics='ttft,tpot,itl,e2el' \ ---result-dir=/workspace/ \ ---result-filename=$RESULT_FILENAME.json \ No newline at end of file +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency 512 \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh index d82bebf72..5f31f0abf 100644 --- a/benchmarks/gptoss_fp4_h100_slurm.sh +++ b/benchmarks/gptoss_fp4_h100_slurm.sh @@ -45,17 +45,19 @@ done kill $TAIL_PID pip install -q datasets pandas + +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model=$MODEL \ ---backend=vllm \ ---base-url="http://0.0.0.0:$PORT" \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) --max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics='ttft,tpot,itl,e2el' \ ---result-dir=/workspace/ \ ---result-filename=$RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh index 37851d39c..146ab16a5 100644 --- a/benchmarks/gptoss_fp4_h200_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_slurm.sh @@ -57,16 +57,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/gptoss_fp4_h200_trt_slurm.sh b/benchmarks/gptoss_fp4_h200_trt_slurm.sh index 0927a0d61..ffe6e65de 100644 --- a/benchmarks/gptoss_fp4_h200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_trt_slurm.sh @@ -68,16 +68,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone 
https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend openai \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh index 0b03900be..a3b1dc4f3 100644 --- a/benchmarks/gptoss_fp4_mi300x_docker.sh +++ b/benchmarks/gptoss_fp4_mi300x_docker.sh @@ -47,15 +47,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model=$MODEL --backend=vllm --base-url=http://0.0.0.0:$PORT \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) \ ---max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics="ttft,tpot,itl,e2el" \ ---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json \ No newline at end of file +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_mi300x_slurm.sh b/benchmarks/gptoss_fp4_mi300x_slurm.sh index 0e4a0b3b2..053c79197 100644 --- a/benchmarks/gptoss_fp4_mi300x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi300x_slurm.sh @@ -57,15 +57,17 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git 
a/benchmarks/gptoss_fp4_mi325x_docker.sh b/benchmarks/gptoss_fp4_mi325x_docker.sh index c57446da3..62d2c5bd0 100644 --- a/benchmarks/gptoss_fp4_mi325x_docker.sh +++ b/benchmarks/gptoss_fp4_mi325x_docker.sh @@ -46,15 +46,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model=$MODEL --backend=vllm --base-url=http://0.0.0.0:$PORT \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) \ ---max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics="ttft,tpot,itl,e2el" \ ---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh index 9cbef3276..c1ac421b6 100644 --- a/benchmarks/gptoss_fp4_mi325x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh @@ -57,16 +57,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh index 8209857bd..b26bf11b5 100644 --- a/benchmarks/gptoss_fp4_mi355x_docker.sh +++ b/benchmarks/gptoss_fp4_mi355x_docker.sh @@ -45,15 +45,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model=$MODEL --backend=vllm --base-url="http://localhost:$PORT" \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$NUM_PROMPTS \ ---max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics="ttft,tpot,itl,e2el" \ 
---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$NUM_PROMPTS" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/gptoss_fp4_mi355x_slurm.sh b/benchmarks/gptoss_fp4_mi355x_slurm.sh index 1fcba771f..d378685db 100644 --- a/benchmarks/gptoss_fp4_mi355x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi355x_slurm.sh @@ -47,15 +47,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url "http://0.0.0.0:$PORT" \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics "ttft,tpot,itl,e2el" \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ From ebe3b626ae96bd301f41bff90d71496290a8a3d7 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 14 Nov 2025 16:09:36 -0600 Subject: [PATCH 043/214] add utils function for benchmark --- benchmarks/benchmark_lib.sh | 1 + benchmarks/gptoss_fp4_h100_docker.sh | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 152b5e4b6..f253f18f4 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -124,6 +124,7 @@ run_benchmark_serving() { git clone https://github.com/kimbochen/bench_serving.git "$BENCH_SERVING_DIR" # Run benchmark + set -x python3 "$BENCH_SERVING_DIR/benchmark_serving.py" \ --model "$model" \ --backend "$backend" \ diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 42bbf6b1a..f2b17f990 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -52,7 +52,6 @@ pip install -q datasets pandas # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ From aa9070ffbedd7878bb3cc5ac1e25ea97a59f9ccf Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 14 Nov 2025 16:30:16 -0600 Subject: [PATCH 044/214] function-ize the waiting for server to start --- benchmarks/benchmark_lib.sh | 68 +++++++++++++++++++++++++ benchmarks/dsr1_fp4_b200_docker.sh | 16 +++--- benchmarks/dsr1_fp4_b200_trt_slurm.sh | 13 ++--- benchmarks/dsr1_fp4_mi355x_docker.sh | 12 ++--- benchmarks/dsr1_fp4_mi355x_slurm.sh | 12 ++--- benchmarks/dsr1_fp8_b200_docker.sh | 16 +++--- benchmarks/dsr1_fp8_b200_trt_slurm.sh | 12 ++--- benchmarks/dsr1_fp8_h200_slurm.sh | 12 ++--- benchmarks/dsr1_fp8_h200_trt_slurm.sh | 16 ++---- benchmarks/dsr1_fp8_mi300x_docker.sh | 3 ++ benchmarks/dsr1_fp8_mi300x_slurm.sh | 12 ++--- 
 benchmarks/dsr1_fp8_mi325x_docker.sh | 12 ++---
 benchmarks/dsr1_fp8_mi325x_slurm.sh | 12 ++---
 benchmarks/dsr1_fp8_mi355x_docker.sh | 16 ++----
 benchmarks/dsr1_fp8_mi355x_slurm.sh | 12 ++---
 benchmarks/gptoss_fp4_b200_docker.sh | 16 ++----
 benchmarks/gptoss_fp4_b200_trt_slurm.sh | 13 ++---
 benchmarks/gptoss_fp4_h100_docker.sh | 20 ++------
 benchmarks/gptoss_fp4_h100_slurm.sh | 16 +++---
 benchmarks/gptoss_fp4_h200_slurm.sh | 12 ++---
 benchmarks/gptoss_fp4_h200_trt_slurm.sh | 12 ++---
 benchmarks/gptoss_fp4_mi300x_docker.sh | 12 ++---
 benchmarks/gptoss_fp4_mi300x_slurm.sh | 12 ++---
 benchmarks/gptoss_fp4_mi325x_docker.sh | 12 ++---
 benchmarks/gptoss_fp4_mi325x_slurm.sh | 12 ++---
 benchmarks/gptoss_fp4_mi355x_docker.sh | 12 ++---
 benchmarks/gptoss_fp4_mi355x_slurm.sh | 12 ++---
 27 files changed, 170 insertions(+), 235 deletions(-)

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index f253f18f4..133f9095f 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -2,6 +2,74 @@

 # Shared benchmarking utilities for InferenceMAX

+# Wait for server to be ready by polling the health endpoint
+# All parameters are required unless marked optional
+# Parameters:
+# --port: Server port
+# --server-log: Path to server log file
+# --server-pid: Server process ID (required)
+# --sleep-interval: Sleep interval between health checks (optional, default: 5)
+wait_for_server_ready() {
+    local port=""
+    local server_log=""
+    local server_pid=""
+    local sleep_interval=5
+
+    # Parse arguments
+    while [[ $# -gt 0 ]]; do
+        case $1 in
+            --port)
+                port="$2"
+                shift 2
+                ;;
+            --server-log)
+                server_log="$2"
+                shift 2
+                ;;
+            --server-pid)
+                server_pid="$2"
+                shift 2
+                ;;
+            --sleep-interval)
+                sleep_interval="$2"
+                shift 2
+                ;;
+            *)
+                echo "Unknown parameter: $1"
+                return 1
+                ;;
+        esac
+    done
+
+    # Validate required parameters
+    if [[ -z "$port" ]]; then
+        echo "Error: --port is required"
+        return 1
+    fi
+    if [[ -z "$server_log" ]]; then
+        echo "Error: --server-log is required"
+        return 1
+    fi
+    if [[ -z "$server_pid" ]]; then
+        echo "Error: --server-pid is required"
+        return 1
+    fi
+
+    # Show logs until server is ready
+    tail -f "$server_log" &
+    local TAIL_PID=$!
+    set +x
+    until curl --output /dev/null --silent --fail http://0.0.0.0:$port/health; do
+        if ! kill -0 "$server_pid" 2>/dev/null; then
+            echo "Server died before becoming healthy. Exiting."
+            kill $TAIL_PID
+            exit 1
+        fi
+        sleep "$sleep_interval"
+    done
+    kill $TAIL_PID
+}
+
 # Run benchmark serving with standardized parameters
 # All parameters are required
 # Parameters:
diff --git a/benchmarks/dsr1_fp4_b200_docker.sh b/benchmarks/dsr1_fp4_b200_docker.sh
index 317015ba8..5f4ab3c5c 100644
--- a/benchmarks/dsr1_fp4_b200_docker.sh
+++ b/benchmarks/dsr1_fp4_b200_docker.sh
@@ -26,20 +26,16 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.
 --ep-size $EP_SIZE --quantization modelopt_fp4 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
 --enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10 > $SERVER_LOG 2>&1 &

-# Show logs until server is ready
-tail -f $SERVER_LOG &
-TAIL_PID=$!
-set +x
-until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do
-  sleep 5
-done
-kill $TAIL_PID
-
-pip install -q datasets pandas
+SERVER_PID=$!
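
With both helpers in place, each script in the series reduces to the same launch / wait / measure shape. A minimal calling sketch, assuming a vLLM-style server (the serve command is illustrative; every script supplies its own launch line):

#!/usr/bin/env bash
source "$(dirname "$0")/benchmark_lib.sh"

SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" > "$SERVER_LOG" 2>&1 &
SERVER_PID=$!  # capture right away: $! names the most recent background job

wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

run_benchmark_serving \
    --model "$MODEL" --port "$PORT" --backend vllm \
    --input-len "$ISL" --output-len "$OSL" --random-range-ratio "$RANDOM_RANGE_RATIO" \
    --num-prompts $(( CONC * 10 )) --max-concurrency "$CONC" \
    --result-filename "$RESULT_FILENAME" --result-dir /workspace/
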
# Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh index 897ef8527..a9f7cc9d4 100644 --- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh @@ -100,19 +100,14 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \ --extra_llm_api_options=$EXTRA_CONFIG_FILE \ > $SERVER_LOG 2>&1 & - -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID +SERVER_PID=$! # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/dsr1_fp4_mi355x_docker.sh b/benchmarks/dsr1_fp4_mi355x_docker.sh index 05603ecae..eed9d1273 100644 --- a/benchmarks/dsr1_fp4_mi355x_docker.sh +++ b/benchmarks/dsr1_fp4_mi355x_docker.sh @@ -31,18 +31,14 @@ python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \ --max-prefill-tokens=$PREFILL_SIZE \ --cuda-graph-max-bs=128 > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID +SERVER_PID=$! # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/dsr1_fp4_mi355x_slurm.sh b/benchmarks/dsr1_fp4_mi355x_slurm.sh index c47bbfb38..afb7ca29c 100644 --- a/benchmarks/dsr1_fp4_mi355x_slurm.sh +++ b/benchmarks/dsr1_fp4_mi355x_slurm.sh @@ -34,18 +34,12 @@ python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \ --cuda-graph-max-bs=128 \ > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID - # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/dsr1_fp8_b200_docker.sh b/benchmarks/dsr1_fp8_b200_docker.sh index fa498ff3e..9a219339c 100644 --- a/benchmarks/dsr1_fp8_b200_docker.sh +++ b/benchmarks/dsr1_fp8_b200_docker.sh @@ -37,20 +37,16 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0. --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL --disable-radix-cache \ --attention-backend trtllm_mla --stream-interval 30 --ep-size $EP_SIZE --moe-runner-backend flashinfer_trtllm --quantization fp8 > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID - -pip install -q datasets pandas +SERVER_PID=$! 
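
The ! kill -0 "$server_pid" guard inside the helper is what turns the poll into a liveness check: signal 0 is never delivered, it only asks the kernel whether the PID still exists and can be signaled, so the loop can tell "server still warming up" apart from "server already crashed". A standalone sketch of the idiom:

sleep 30 &
pid=$!
if kill -0 "$pid" 2>/dev/null; then
    echo "process $pid is alive"
else
    echo "process $pid has exited (or is not ours to signal)"
fi
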
# Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh index a22536d82..a78fece38 100644 --- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh @@ -72,18 +72,12 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \ SERVER_PID=$! -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID - # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/dsr1_fp8_h200_slurm.sh b/benchmarks/dsr1_fp8_h200_slurm.sh index 7444f763f..6eeec6df1 100644 --- a/benchmarks/dsr1_fp8_h200_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_slurm.sh @@ -44,18 +44,14 @@ else > $SERVER_LOG 2>&1 & fi -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID +SERVER_PID=$! # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index 94baa7850..74b2ce8df 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -72,22 +72,12 @@ PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \ SERVER_PID=$! -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - if ! kill -0 $SERVER_PID 2>/dev/null; then - echo "Server died before becoming healthy. Exiting." - exit 1 - fi - sleep 5 -done -kill $TAIL_PID - # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/dsr1_fp8_mi300x_docker.sh b/benchmarks/dsr1_fp8_mi300x_docker.sh index 5ffebb941..db27f4e74 100644 --- a/benchmarks/dsr1_fp8_mi300x_docker.sh +++ b/benchmarks/dsr1_fp8_mi300x_docker.sh @@ -50,6 +50,9 @@ kill $TAIL_PID # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/dsr1_fp8_mi300x_slurm.sh b/benchmarks/dsr1_fp8_mi300x_slurm.sh index ba1597982..0d191299c 100644 --- a/benchmarks/dsr1_fp8_mi300x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi300x_slurm.sh @@ -47,18 +47,14 @@ python3 -m sglang.launch_server \ --disable-radix-cache \ > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! 
-set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID +SERVER_PID=$! # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/dsr1_fp8_mi325x_docker.sh b/benchmarks/dsr1_fp8_mi325x_docker.sh index c0f95846d..9f34e7563 100644 --- a/benchmarks/dsr1_fp8_mi325x_docker.sh +++ b/benchmarks/dsr1_fp8_mi325x_docker.sh @@ -28,18 +28,14 @@ python3 -m sglang.launch_server \ --max-prefill-tokens 196608 \ --cuda-graph-max-bs 128 > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID +SERVER_PID=$! # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/dsr1_fp8_mi325x_slurm.sh b/benchmarks/dsr1_fp8_mi325x_slurm.sh index 1ccec681f..15e9cce64 100644 --- a/benchmarks/dsr1_fp8_mi325x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi325x_slurm.sh @@ -23,18 +23,12 @@ python3 -m sglang.launch_server \ --disable-radix-cache \ > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID - # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/dsr1_fp8_mi355x_docker.sh b/benchmarks/dsr1_fp8_mi355x_docker.sh index 50d9bb02d..a86d7adbe 100644 --- a/benchmarks/dsr1_fp8_mi355x_docker.sh +++ b/benchmarks/dsr1_fp8_mi355x_docker.sh @@ -28,14 +28,11 @@ python3 -m sglang.launch_server \ --max-prefill-tokens 196608 \ --cuda-graph-max-bs 128 > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" if [[ "$MODEL" == "amd/DeepSeek-R1-0528-MXFP4-Preview" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then if [[ "$OSL" == "8192" ]]; then @@ -47,9 +44,6 @@ else NUM_PROMPTS=$(( CONC * 10 )) fi -# Source benchmark utilities -source "$(dirname "$0")/benchmark_lib.sh" - set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/dsr1_fp8_mi355x_slurm.sh b/benchmarks/dsr1_fp8_mi355x_slurm.sh index 86b5f9649..54ba29fc0 100644 --- a/benchmarks/dsr1_fp8_mi355x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi355x_slurm.sh @@ -32,18 +32,12 @@ python3 -m sglang.launch_server \ --max-prefill-tokens 196608 \ --cuda-graph-max-bs 128 > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! 
-set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID - # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh index 208ea278d..60c1a1582 100644 --- a/benchmarks/gptoss_fp4_b200_docker.sh +++ b/benchmarks/gptoss_fp4_b200_docker.sh @@ -53,20 +53,14 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ SERVER_PID=$! -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID - -pip install -q datasets pandas - # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh index 4647cb346..0ec2f325f 100644 --- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh @@ -78,19 +78,14 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \ --extra_llm_api_options=$EXTRA_CONFIG_FILE \ > $SERVER_LOG 2>&1 & - -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID +SERVER_PID=$! # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index f2b17f990..9cf7c5275 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -34,24 +34,14 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ SERVER_PID=$! -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - if ! kill -0 $SERVER_PID 2>/dev/null; then - echo "Server died before becoming healthy. Exiting." - exit 1 - fi - sleep 5 -done -kill $TAIL_PID - -pip install -q datasets pandas - # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh index 5f31f0abf..c3d598116 100644 --- a/benchmarks/gptoss_fp4_h100_slurm.sh +++ b/benchmarks/gptoss_fp4_h100_slurm.sh @@ -35,20 +35,16 @@ PYTHONNOUSERSITE=1 vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --max-num-seqs=$CONC \ --disable-log-requests > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID - -pip install -q datasets pandas +SERVER_PID=$! 
# Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh index 146ab16a5..a3e47ca44 100644 --- a/benchmarks/gptoss_fp4_h200_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_slurm.sh @@ -48,18 +48,14 @@ PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config --gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \ --disable-log-requests > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID +SERVER_PID=$! # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/gptoss_fp4_h200_trt_slurm.sh b/benchmarks/gptoss_fp4_h200_trt_slurm.sh index ffe6e65de..81f1f67de 100644 --- a/benchmarks/gptoss_fp4_h200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_trt_slurm.sh @@ -59,18 +59,14 @@ trtllm-serve $MODEL \ --pp_size=1 \ > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID +SERVER_PID=$! # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh index a3b1dc4f3..003ebf90e 100644 --- a/benchmarks/gptoss_fp4_mi300x_docker.sh +++ b/benchmarks/gptoss_fp4_mi300x_docker.sh @@ -38,18 +38,12 @@ vllm serve $MODEL --port $PORT \ --disable-log-requests \ --async-scheduling > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID - # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/gptoss_fp4_mi300x_slurm.sh b/benchmarks/gptoss_fp4_mi300x_slurm.sh index 053c79197..a9e164cc2 100644 --- a/benchmarks/gptoss_fp4_mi300x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi300x_slurm.sh @@ -48,18 +48,14 @@ vllm serve $MODEL --port $PORT \ --async-scheduling \ > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID +SERVER_PID=$! 
# Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/gptoss_fp4_mi325x_docker.sh b/benchmarks/gptoss_fp4_mi325x_docker.sh index 62d2c5bd0..a000b462f 100644 --- a/benchmarks/gptoss_fp4_mi325x_docker.sh +++ b/benchmarks/gptoss_fp4_mi325x_docker.sh @@ -37,18 +37,12 @@ vllm serve $MODEL --port $PORT \ --disable-log-requests \ --async-scheduling > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID - # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh index c1ac421b6..a9dbff484 100644 --- a/benchmarks/gptoss_fp4_mi325x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh @@ -48,18 +48,12 @@ vllm serve $MODEL --port $PORT \ --async-scheduling \ > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID - # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh index b26bf11b5..e7399694f 100644 --- a/benchmarks/gptoss_fp4_mi355x_docker.sh +++ b/benchmarks/gptoss_fp4_mi355x_docker.sh @@ -36,18 +36,12 @@ vllm serve $MODEL --port $PORT \ --disable-log-requests \ --async-scheduling > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID - # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/gptoss_fp4_mi355x_slurm.sh b/benchmarks/gptoss_fp4_mi355x_slurm.sh index d378685db..2f4d84927 100644 --- a/benchmarks/gptoss_fp4_mi355x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi355x_slurm.sh @@ -38,18 +38,12 @@ vllm serve $MODEL --port $PORT \ --disable-log-requests \ --async-scheduling > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! 
-set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID - # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ From 0d2c112775ed23d87722c923253a63e49d3fdeec Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 14 Nov 2025 16:31:55 -0600 Subject: [PATCH 045/214] dont show arg parsing set -x --- benchmarks/benchmark_lib.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 133f9095f..85d221a92 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -10,6 +10,7 @@ # --server-pid: Server process ID (required) # --sleep-interval: Sleep interval between health checks (optional, default: 5) wait_for_server_ready() { + set -x local port="" local server_log="" local server_pid="" From 271091d3e047fab771eabf5b6fada8a75cffa037 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 14 Nov 2025 16:36:10 -0600 Subject: [PATCH 046/214] dont show arg parsing set +x oops --- benchmarks/benchmark_lib.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 85d221a92..8e52b949d 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -10,7 +10,7 @@ # --server-pid: Server process ID (required) # --sleep-interval: Sleep interval between health checks (optional, default: 5) wait_for_server_ready() { - set -x + set +x local port="" local server_log="" local server_pid="" From 898b132bd48b430c97bf4c8ecdaa322ba80e410f Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 14 Nov 2025 16:44:58 -0600 Subject: [PATCH 047/214] dont show arg parsing set +x oops --- benchmarks/benchmark_lib.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 8e52b949d..cc3448d40 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -59,7 +59,6 @@ wait_for_server_ready() { # Show logs until server is ready tail -f "$server_log" & local TAIL_PID=$! - set +x until curl --output /dev/null --silent --fail http://0.0.0.0:$port/health; do if ! kill -0 "$server_pid" 2>/dev/null; then echo "Server died before becoming healthy. Exiting." 
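# The three commits above chase one goal (keeping the helper's argument
# parsing out of the xtrace output) by hand-toggling set -x/+x, which is
# easy to get backwards, hence the "oops" fixups. For reference, on
# bash 4.4+ the same effect falls out of `local -`, which makes option
# changes function-scoped; a sketch, not what these patches do:
quiet_parse() {
  local -   # bash 4.4+: shell options changed below revert on return
  set +x    # no tracing while this function parses its arguments
  local port="$1"
  echo "parsed port=$port"
}
set -x            # caller enables tracing
quiet_parse 8888  # body runs untraced; tracing resumes after the call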
@@ -85,6 +84,7 @@ wait_for_server_ready() { # --result-filename: Result filename without extension # --result-dir: Result directory run_benchmark_serving() { + set +x local model="" local port="" local backend="" @@ -210,4 +210,5 @@ run_benchmark_serving() { --percentile-metrics 'ttft,tpot,itl,e2el' \ --result-dir "$result_dir" \ --result-filename "$result_filename.json" + set +x } From fd2e33e29c4042f311ba3c95590fe97a3e3ce04c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 14 Nov 2025 17:04:18 -0600 Subject: [PATCH 048/214] capture server pid --- benchmarks/dsr1_fp4_b200_docker.sh | 1 - benchmarks/dsr1_fp4_b200_trt_slurm.sh | 1 - benchmarks/dsr1_fp4_mi355x_docker.sh | 1 - benchmarks/dsr1_fp4_mi355x_slurm.sh | 3 ++- benchmarks/dsr1_fp8_b200_docker.sh | 1 - benchmarks/dsr1_fp8_b200_trt_slurm.sh | 1 - benchmarks/dsr1_fp8_h200_slurm.sh | 1 - benchmarks/dsr1_fp8_h200_trt_slurm.sh | 1 - benchmarks/dsr1_fp8_mi300x_docker.sh | 11 +---------- benchmarks/dsr1_fp8_mi300x_slurm.sh | 1 - benchmarks/dsr1_fp8_mi325x_docker.sh | 1 - benchmarks/dsr1_fp8_mi325x_slurm.sh | 3 ++- benchmarks/dsr1_fp8_mi355x_docker.sh | 3 ++- benchmarks/dsr1_fp8_mi355x_slurm.sh | 3 ++- benchmarks/gptoss_fp4_b200_docker.sh | 1 - benchmarks/gptoss_fp4_h100_slurm.sh | 1 - benchmarks/gptoss_fp4_h200_slurm.sh | 1 - benchmarks/gptoss_fp4_h200_trt_slurm.sh | 1 - benchmarks/gptoss_fp4_mi300x_docker.sh | 3 ++- benchmarks/gptoss_fp4_mi325x_docker.sh | 3 ++- benchmarks/gptoss_fp4_mi325x_slurm.sh | 3 ++- benchmarks/gptoss_fp4_mi355x_docker.sh | 3 ++- benchmarks/gptoss_fp4_mi355x_slurm.sh | 3 ++- 23 files changed, 19 insertions(+), 32 deletions(-) diff --git a/benchmarks/dsr1_fp4_b200_docker.sh b/benchmarks/dsr1_fp4_b200_docker.sh index 5f4ab3c5c..a520871fa 100644 --- a/benchmarks/dsr1_fp4_b200_docker.sh +++ b/benchmarks/dsr1_fp4_b200_docker.sh @@ -36,7 +36,6 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh index a9f7cc9d4..b4227e428 100644 --- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh @@ -108,7 +108,6 @@ source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/dsr1_fp4_mi355x_docker.sh b/benchmarks/dsr1_fp4_mi355x_docker.sh index eed9d1273..f19b6df2e 100644 --- a/benchmarks/dsr1_fp4_mi355x_docker.sh +++ b/benchmarks/dsr1_fp4_mi355x_docker.sh @@ -39,7 +39,6 @@ source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/dsr1_fp4_mi355x_slurm.sh b/benchmarks/dsr1_fp4_mi355x_slurm.sh index afb7ca29c..f4d7f1d39 100644 --- a/benchmarks/dsr1_fp4_mi355x_slurm.sh +++ b/benchmarks/dsr1_fp4_mi355x_slurm.sh @@ -34,13 +34,14 @@ python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \ --cuda-graph-max-bs=128 \ > $SERVER_LOG 2>&1 & +SERVER_PID=$! 
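# A note on the SERVER_PID=$! lines this commit threads through each
# script: $! names the most recently backgrounded job, so it has to be
# read on the line right after the launch. The earlier layout backgrounded
# `tail -f` before anything read $!, leaving wait_for_server_ready no
# server PID to probe. A minimal illustration of the ordering pitfall
# (sleep stands in for the real server launch):
sleep 300 > "$SERVER_LOG" 2>&1 &
SERVER_PID=$!             # the server's PID; read $! before any other '&'
tail -f "$SERVER_LOG" &   # from this point, $! names tail, not the server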
+ # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/dsr1_fp8_b200_docker.sh b/benchmarks/dsr1_fp8_b200_docker.sh index 9a219339c..ffa7644bd 100644 --- a/benchmarks/dsr1_fp8_b200_docker.sh +++ b/benchmarks/dsr1_fp8_b200_docker.sh @@ -47,7 +47,6 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh index a78fece38..a9a1a04ff 100644 --- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh @@ -78,7 +78,6 @@ source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/dsr1_fp8_h200_slurm.sh b/benchmarks/dsr1_fp8_h200_slurm.sh index 6eeec6df1..06345ecb2 100644 --- a/benchmarks/dsr1_fp8_h200_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_slurm.sh @@ -52,7 +52,6 @@ source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index 74b2ce8df..4ece6f7bc 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -78,7 +78,6 @@ source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/dsr1_fp8_mi300x_docker.sh b/benchmarks/dsr1_fp8_mi300x_docker.sh index db27f4e74..8c269dd83 100644 --- a/benchmarks/dsr1_fp8_mi300x_docker.sh +++ b/benchmarks/dsr1_fp8_mi300x_docker.sh @@ -37,15 +37,7 @@ python3 -m sglang.launch_server \ --max-prefill-tokens=196608 \ --disable-radix-cache > $SERVER_LOG 2>&1 & - -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID +SERVER_PID=$! 
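# With the PID in hand, cleanup also becomes possible. These diffs don't
# add one, but an EXIT trap is the usual way to guarantee a backgrounded
# server dies even when a later step fails; a sketch under that
# assumption (my_server_cmd is a placeholder, not a real command):
my_server_cmd > "$SERVER_LOG" 2>&1 &
SERVER_PID=$!
trap 'kill "$SERVER_PID" 2>/dev/null || true' EXIT   # fires on any exit path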
# Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" @@ -53,7 +45,6 @@ source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/dsr1_fp8_mi300x_slurm.sh b/benchmarks/dsr1_fp8_mi300x_slurm.sh index 0d191299c..5fad7a587 100644 --- a/benchmarks/dsr1_fp8_mi300x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi300x_slurm.sh @@ -55,7 +55,6 @@ source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/dsr1_fp8_mi325x_docker.sh b/benchmarks/dsr1_fp8_mi325x_docker.sh index 9f34e7563..565b8fb45 100644 --- a/benchmarks/dsr1_fp8_mi325x_docker.sh +++ b/benchmarks/dsr1_fp8_mi325x_docker.sh @@ -36,7 +36,6 @@ source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/dsr1_fp8_mi325x_slurm.sh b/benchmarks/dsr1_fp8_mi325x_slurm.sh index 15e9cce64..67e4cc394 100644 --- a/benchmarks/dsr1_fp8_mi325x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi325x_slurm.sh @@ -23,13 +23,14 @@ python3 -m sglang.launch_server \ --disable-radix-cache \ > $SERVER_LOG 2>&1 & +SERVER_PID=$! + # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/dsr1_fp8_mi355x_docker.sh b/benchmarks/dsr1_fp8_mi355x_docker.sh index a86d7adbe..d4f1dd013 100644 --- a/benchmarks/dsr1_fp8_mi355x_docker.sh +++ b/benchmarks/dsr1_fp8_mi355x_docker.sh @@ -28,6 +28,8 @@ python3 -m sglang.launch_server \ --max-prefill-tokens 196608 \ --cuda-graph-max-bs 128 > $SERVER_LOG 2>&1 & +SERVER_PID=$! + # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" @@ -44,7 +46,6 @@ else NUM_PROMPTS=$(( CONC * 10 )) fi -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/dsr1_fp8_mi355x_slurm.sh b/benchmarks/dsr1_fp8_mi355x_slurm.sh index 54ba29fc0..fd6fe49fb 100644 --- a/benchmarks/dsr1_fp8_mi355x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi355x_slurm.sh @@ -32,13 +32,14 @@ python3 -m sglang.launch_server \ --max-prefill-tokens 196608 \ --cuda-graph-max-bs 128 > $SERVER_LOG 2>&1 & +SERVER_PID=$! 
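# One gap the helper leaves open: the kill -0 check catches a server that
# crashes, but a server that stays alive without ever answering /health
# still blocks until the CI-level timeout. If that matters, a deadline is
# cheap to add; a sketch assuming a 30-minute readiness budget:
deadline=$(( SECONDS + 1800 ))   # SECONDS is bash's running-time counter
until curl -sf -o /dev/null "http://0.0.0.0:${PORT}/health"; do
  if (( SECONDS >= deadline )); then
    echo "Server not ready within 30 minutes." >&2
    exit 1
  fi
  sleep 5
done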
+ # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh index 60c1a1582..1736701c4 100644 --- a/benchmarks/gptoss_fp4_b200_docker.sh +++ b/benchmarks/gptoss_fp4_b200_docker.sh @@ -61,7 +61,6 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh index c3d598116..843219b95 100644 --- a/benchmarks/gptoss_fp4_h100_slurm.sh +++ b/benchmarks/gptoss_fp4_h100_slurm.sh @@ -45,7 +45,6 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh index a3e47ca44..dc29baf8d 100644 --- a/benchmarks/gptoss_fp4_h200_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_slurm.sh @@ -56,7 +56,6 @@ source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/gptoss_fp4_h200_trt_slurm.sh b/benchmarks/gptoss_fp4_h200_trt_slurm.sh index 81f1f67de..21d6ae02c 100644 --- a/benchmarks/gptoss_fp4_h200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_trt_slurm.sh @@ -67,7 +67,6 @@ source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh index 003ebf90e..7d1f98226 100644 --- a/benchmarks/gptoss_fp4_mi300x_docker.sh +++ b/benchmarks/gptoss_fp4_mi300x_docker.sh @@ -38,13 +38,14 @@ vllm serve $MODEL --port $PORT \ --disable-log-requests \ --async-scheduling > $SERVER_LOG 2>&1 & +SERVER_PID=$! + # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/gptoss_fp4_mi325x_docker.sh b/benchmarks/gptoss_fp4_mi325x_docker.sh index a000b462f..46462ad6d 100644 --- a/benchmarks/gptoss_fp4_mi325x_docker.sh +++ b/benchmarks/gptoss_fp4_mi325x_docker.sh @@ -37,13 +37,14 @@ vllm serve $MODEL --port $PORT \ --disable-log-requests \ --async-scheduling > $SERVER_LOG 2>&1 & +SERVER_PID=$! + # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh index a9dbff484..f15e6261c 100644 --- a/benchmarks/gptoss_fp4_mi325x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh @@ -48,13 +48,14 @@ vllm serve $MODEL --port $PORT \ --async-scheduling \ > $SERVER_LOG 2>&1 & +SERVER_PID=$! 
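# After this commit every serving script reduces to the same skeleton:
# launch, capture PID, wait, benchmark. Spelled out once as a sketch
# (engine flags and the full benchmark argument list vary per platform
# and are elided here):
vllm serve "$MODEL" --port "$PORT" > "$SERVER_LOG" 2>&1 &
SERVER_PID=$!
source "$(dirname "$0")/benchmark_lib.sh"
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
run_benchmark_serving --model "$MODEL" --port "$PORT" \
  --num-prompts $(( CONC * 10 )) --max-concurrency "$CONC" \
  --result-filename "$RESULT_FILENAME" --result-dir /workspace/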
+ # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh index e7399694f..0e54245d4 100644 --- a/benchmarks/gptoss_fp4_mi355x_docker.sh +++ b/benchmarks/gptoss_fp4_mi355x_docker.sh @@ -36,13 +36,14 @@ vllm serve $MODEL --port $PORT \ --disable-log-requests \ --async-scheduling > $SERVER_LOG 2>&1 & +SERVER_PID=$! + # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/gptoss_fp4_mi355x_slurm.sh b/benchmarks/gptoss_fp4_mi355x_slurm.sh index 2f4d84927..a2adf2952 100644 --- a/benchmarks/gptoss_fp4_mi355x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi355x_slurm.sh @@ -38,13 +38,14 @@ vllm serve $MODEL --port $PORT \ --disable-log-requests \ --async-scheduling > $SERVER_LOG 2>&1 & +SERVER_PID=$! + # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ From 2a4faf5c9f26f02ee2f9e98c4fd1335b1c140fde Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 14 Nov 2025 17:50:57 -0600 Subject: [PATCH 049/214] Squash-merge bryan/eval into refactor-docker-runner-launch --- .github/workflows/eval-h100-gms8k.yml | 59 +++++++++ .github/workflows/eval-tmpl.yml | 136 +++++++++++++++++++ benchmarks/dsr1_fp4_mi355x_slurm.sh | 137 +++++++++++++++++++ benchmarks/gptoss_fp4_h100_docker.sh | 8 +- benchmarks/gptoss_fp4_h100_slurm.sh | 12 +- benchmarks/gptoss_fp4_h200_slurm.sh | 2 +- benchmarks/gptoss_fp4_mi300x_slurm.sh | 182 ++++++++++++++++++++++++++ benchmarks/gptoss_fp4_mi325x_slurm.sh | 3 +- benchmarks/gptoss_fp4_mi355x_slurm.sh | 2 + 9 files changed, 532 insertions(+), 9 deletions(-) create mode 100644 .github/workflows/eval-h100-gms8k.yml create mode 100644 .github/workflows/eval-tmpl.yml diff --git a/.github/workflows/eval-h100-gms8k.yml b/.github/workflows/eval-h100-gms8k.yml new file mode 100644 index 000000000..84723f596 --- /dev/null +++ b/.github/workflows/eval-h100-gms8k.yml @@ -0,0 +1,59 @@ +name: Eval - GSM8K on H100 (PoC) + +on: + workflow_dispatch: + inputs: + image: + description: "Serving image" + required: false + type: string + default: "vllm/vllm-openai:v0.11.0" + model: + description: "Model" + required: false + type: string + default: "openai/gpt-oss-120b" + tp: + description: "Tensor Parallel Size" + required: false + type: string + default: "4" + port: + description: "Server port" + required: false + type: string + default: "8888" + num_fewshot: + description: "Fewshot k for GSM8K" + required: false + type: string + default: "5" + limit: + description: "Sample limit for GSM8K" + required: false + type: string + default: "200" + push: + paths: + - '.github/workflows/eval-h100-gms8k.yml' + - '.github/workflows/eval-tmpl.yml' + - 'benchmarks/dsr1_fp8_mi325x_slurm.sh' + +jobs: + eval: + uses: ./.github/workflows/eval-tmpl.yml + secrets: inherit + with: + runner: mi325x-tw_1 + image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' }} + model: 
${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }} + framework: sglang + precision: fp8 + exp-name: dsr1_gsm8k_poc + tp: ${{ inputs.tp || '4' }} + ep: '1' + dp-attn: false + port: ${{ inputs.port || '8888' }} + eval-task: gsm8k + num-fewshot: ${{ inputs.num_fewshot || '5' }} + limit: ${{ inputs.limit || '200' }} diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml new file mode 100644 index 000000000..9c4e77c78 --- /dev/null +++ b/.github/workflows/eval-tmpl.yml @@ -0,0 +1,136 @@ +name: Template - Eval + +on: + workflow_call: + inputs: + runner: + required: true + type: string + image: + required: true + type: string + model: + required: true + type: string + framework: + required: true + type: string + precision: + required: true + type: string + exp-name: + required: true + type: string + tp: + required: true + type: string + ep: + required: false + type: string + default: '1' + dp-attn: + required: false + type: boolean + default: false + port: + required: false + type: string + default: '8888' + eval-task: + required: true + type: string + num-fewshot: + required: false + type: string + default: '5' + limit: + required: false + type: string + default: '200' + +env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + HF_HUB_CACHE: '/mnt/hf_hub_cache/' + EXP_NAME: ${{ inputs.exp-name }} + MODEL: ${{ inputs.model }} + IMAGE: ${{ inputs.image }} + FRAMEWORK: ${{ inputs.framework }} + PRECISION: ${{ inputs.precision }} + TP: ${{ inputs.tp }} + EP_SIZE: ${{ inputs.ep }} + DP_ATTENTION: ${{ inputs.dp-attn }} + PORT: ${{ inputs.port }} + EVAL_TASK: ${{ inputs['eval-task'] }} + NUM_FEWSHOT: ${{ inputs['num-fewshot'] }} + LIMIT: ${{ inputs.limit }} + EVAL_RESULT_DIR: eval_out + # Server-side concurrency default (used by some server scripts) + CONC: '32' + MAX_MODEL_LEN: '8192' + ISL: 1024 + OSL: 8192 + RANDOM_RANGE_RATIO: '1.0' + RESULT_FILENAME: results + +jobs: + eval: + runs-on: ${{ inputs.runner }} + timeout-minutes: 180 + name: "Eval ${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.precision }} tp=${{ inputs.tp }} task=${{ inputs['eval-task'] }} limit=${{ inputs.limit }}" + steps: + - name: Resource cleanup + run: | + if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then + host=$(hostname) + + if [[ "$host" == "b200-81" || "$host" == "b200-80" || "$host" == "b200-79" ]]; then + echo "[INFO] Running container-by-container cleanup on $host" + for cid in $(docker ps -aq); do + echo "[INFO] Cleaning container $cid" + docker stop -t 90 "$cid" || true + docker wait "$cid" >/dev/null 2>&1 || true + docker rm -f "$cid" >/dev/null 2>&1 || true + done + sleep 2 + if nvidia-smi --query-compute-apps=pid --format=csv,noheader | grep -q '[0-9]'; then + echo "[WARN] After stop, GPU still busy:" + nvidia-smi || true + fi + else + echo "[Docker] looking at docker resources ..." + docker ps -aq + fi + fi + if command -v squeue >/dev/null 2>&1; then + echo "[Slurm] Cleaning up resources ..." 
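# The squeue wait just below loops until the queue drains, with no upper
# bound; if a job can ever survive scancel, the runner hangs here. A
# capped variant (a sketch, not what this workflow does) would bail out
# after a fixed number of polls:
#   for _ in $(seq 1 60); do
#     [ -z "$(squeue -u $USER --noheader --format='%i')" ] && break
#     sleep 5
#   done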
+ scancel -u $USER || true + while [ -n "$(squeue -u $USER --noheader --format='%i')" ]; do + squeue -u $USER || true + sleep 5 + done + fi + + - uses: actions/checkout@v5 + with: + fetch-depth: 0 + # Avoid aggressive workspace deletion if stale, rely on git reset/clean later + clean: false + + - name: Launch eval via runner script + env: + RUNNER_NAME: ${{ runner.name }} + RUN_MODE: eval + # Optional: structured filename if runner chooses to use it later + EVAL_RESULT_BASENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_${{ env.FRAMEWORK }}_${{ runner.name }} + run: | + bash ./runners/launch_${RUNNER_NAME%%_*}.sh + + - name: Upload eval artifacts + if: always() + uses: actions/upload-artifact@v5 + with: + name: eval_${{ env.EXP_NAME }}_${{ runner.name }} + path: | + ${{ env.EVAL_RESULT_DIR }}/ + ${{ env.EVAL_RESULT_DIR }}/* + ${{ env.EVAL_RESULT_DIR }}/** diff --git a/benchmarks/dsr1_fp4_mi355x_slurm.sh b/benchmarks/dsr1_fp4_mi355x_slurm.sh index f4d7f1d39..b0f1c33c0 100644 --- a/benchmarks/dsr1_fp4_mi355x_slurm.sh +++ b/benchmarks/dsr1_fp4_mi355x_slurm.sh @@ -54,3 +54,140 @@ run_benchmark_serving \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +####### + +# +## Evals setup +# !TODO clean env vars +EVAL_RESULT_DIR=${EVAL_RESULT_DIR:-eval_out} +OPENAI_SERVER_BASE="http://0.0.0.0:${PORT}" +OPENAI_COMP_BASE="$OPENAI_SERVER_BASE/v1/completions" +OPENAI_CHAT_BASE="$OPENAI_SERVER_BASE/v1/chat/completions" +export OPENAI_API_KEY=${OPENAI_API_KEY:-EMPTY} + +# Patch to convert bypass regex error if content field is empty +PATCH_DIR="$(mktemp -d)" +cat > "$PATCH_DIR/sitecustomize.py" <<'PY' +import re, sys, unicodedata +from lm_eval.filters import extraction as ex + +def _s(x): # coerce to str + return x if isinstance(x, str) else "" + +# --- Patch RegexFilter.apply (used by many datasets) --- +_orig_regex_apply = ex.RegexFilter.apply +def _safe_regex_apply(self, resps, docs): + out = [] + for inst in resps: # inst is a list of candidate responses for one doc + filtered = [] + for resp in inst: + txt = _s(resp) + m = self.regex.findall(txt) + if m: + m = m[self.group_select] + if isinstance(m, tuple): + m = [t for t in m if t] + m = m[0] if m else self.fallback + m = m.strip() + else: + m = self.fallback + filtered.append(m) + out.append(filtered) + return out +ex.RegexFilter.apply = _safe_regex_apply + +# --- Patch MultiChoiceRegexFilter.apply (used by GSM8K flexible-extract) --- +_orig_mc_apply = ex.MultiChoiceRegexFilter.apply +def _safe_mc_apply(self, resps, docs): + def find_match(regex, resp, convert_dict={}): + txt = _s(resp) + match = regex.findall(txt) + if match: + match = match[self.group_select] + if isinstance(match, tuple): + match = [m for m in match if m] + if match: + match = match[0] + if match: + match = match.strip() + if match in convert_dict: + return convert_dict[match] + return match + return None + + punct_tbl = dict.fromkeys( + i for i in range(sys.maxunicode) + if unicodedata.category(chr(i)).startswith("P") + ) + + def filter_ignores(st): + st = _s(st) + if self.regexes_to_ignore is not None: + for s in self.regexes_to_ignore: + st = re.sub(s, "", st) + if self.ignore_case: + st = st.lower() + if self.ignore_punctuation: + st = st.translate(punct_tbl) + return st + + out = [] + for r, doc in zip(resps, docs): + # Build fallback regexes from choices (A, B, C, ...) 
as in upstream + fallback_regexes, choice_to_alpha = [], {} + next_alpha = "A" + without_paren, without_paren_to_target = [], {} + for c in doc.get("choices", []): + m = filter_ignores(c.strip()) + fallback_regexes.append(re.escape(m)) + choice_to_alpha[m] = f"({next_alpha})" + without_paren.append(next_alpha) + without_paren_to_target[next_alpha] = f"({next_alpha})" + next_alpha = chr(ord(next_alpha) + 1) + + fallback_regex = re.compile("|".join(fallback_regexes)) if fallback_regexes else None + without_paren_regex = re.compile(rf":[\s]*({'|'.join(without_paren)})") if without_paren else None + + filtered = [] + for resp in r: + m = find_match(self.regex, resp) + if not m and fallback_regex: + m = find_match(fallback_regex, filter_ignores(resp), choice_to_alpha) + if not m and without_paren_regex: + m = find_match(without_paren_regex, resp, without_paren_to_target) + if not m: + m = self.fallback + filtered.append(m) + out.append(filtered) + return out + +ex.MultiChoiceRegexFilter.apply = _safe_mc_apply +PY + +export PYTHONPATH="${PATCH_DIR}:${PYTHONPATH:-}" +set -x +python3 -m lm_eval --model local-chat-completions --apply_chat_template \ +--tasks ${EVAL_TASK:-gsm8k} \ +--num_fewshot ${NUM_FEWSHOT:-5} \ +--batch_size 2 \ +--output_path "/workspace/${EVAL_RESULT_DIR}" \ +--model_args "model=$MODEL,base_url=$OPENAI_CHAT_BASE,api_key=$OPENAI_API_KEY,eos_string=,max_retries=3,num_concurrent=32,tokenized_requests=False" \ +--gen_kwargs "max_tokens=8192,temperature=0,top_p=1" +set +x + +# Append a Markdown table to the GitHub Actions job summary using helper in bench_serving +if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then +python3 bench_serving/lm_eval_to_md.py \ + --results-dir "/workspace/${EVAL_RESULT_DIR}" \ + --task "${EVAL_TASK:-gsm8k}" \ + --framework "${FRAMEWORK}" \ + --precision "${PRECISION}" \ + --tp "${TP:-1}" \ + --ep "${EP_SIZE:-1}" \ + --dp-attention "${DP_ATTENTION:-false}" \ + >> "$GITHUB_STEP_SUMMARY" || true +fi + +echo "Evaluation completed. 
Results in /workspace/${EVAL_RESULT_DIR}" +exit 0 diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 9cf7c5275..e8f7bdd1d 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -1,9 +1,11 @@ #!/usr/bin/env bash -# === Required Env Vars === +# === Required Env Vars === # HF_TOKEN # HF_HUB_CACHE # MODEL +# ISL +# OSL # MAX_MODEL_LEN # RANDOM_RANGE_RATIO # TP @@ -12,6 +14,7 @@ # OSL +# Create a basic vLLM config cat > config.yaml << EOF compilation-config: '{"cudagraph_mode":"PIECEWISE"}' async-scheduling: true @@ -24,6 +27,7 @@ EOF export PYTHONNOUSERSITE=1 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +# Start server in the background, shld be openai/gpt-oss-120b set -x vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --config config.yaml \ @@ -52,4 +56,4 @@ run_benchmark_serving \ --num-prompts $(( $CONC * 10 )) \ --max-concurrency 512 \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ No newline at end of file + --result-dir /workspace/ diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh index 843219b95..82fad1dd7 100644 --- a/benchmarks/gptoss_fp4_h100_slurm.sh +++ b/benchmarks/gptoss_fp4_h100_slurm.sh @@ -26,14 +26,16 @@ EOF SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) export TORCH_CUDA_ARCH_LIST="9.0" +PORT=${PORT:-8888} set -x PYTHONNOUSERSITE=1 vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ ---config config.yaml \ ---gpu-memory-utilization=0.9 \ ---tensor-parallel-size=$TP \ ---max-num-seqs=$CONC \ ---disable-log-requests > $SERVER_LOG 2>&1 & + --config config.yaml \ + --gpu-memory-utilization=0.9 \ + --tensor-parallel-size=$TP \ + --max-num-seqs=$CONC \ + --disable-log-requests \ + > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh index dc29baf8d..f87361ffd 100644 --- a/benchmarks/gptoss_fp4_h200_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_slurm.sh @@ -16,7 +16,6 @@ echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -set -x hf download $MODEL pip install datasets pandas @@ -40,6 +39,7 @@ max-model-len: $CALCULATED_MAX_MODEL_LEN EOF SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +export TORCH_CUDA_ARCH_LIST="9.0" PORT=$(( 8888 + $PORT_OFFSET )) export TORCH_CUDA_ARCH_LIST="9.0" diff --git a/benchmarks/gptoss_fp4_mi300x_slurm.sh b/benchmarks/gptoss_fp4_mi300x_slurm.sh index a9e164cc2..eadfa16b3 100644 --- a/benchmarks/gptoss_fp4_mi300x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi300x_slurm.sh @@ -35,6 +35,8 @@ export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 +# +## Start up vllm server set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ @@ -67,3 +69,183 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ +set +x + +# +## Ensure benching scripts present +git config --global --add safe.directory /workspace || true +if [[ ! 
-d bench_serving ]]; then + git clone https://github.com/kimbochen/bench_serving.git +fi + +# +## Deps for lm-eval +#python3 -m pip install -q --upgrade pip || true +python3 -m pip install -q --no-cache-dir "lm-eval[api]" || true +# Temporary: workaround known harness issue +python3 -m pip install -q --no-cache-dir --no-deps "git+https://github.com/EleutherAI/lm-evaluation-harness.git@main" || true + +# +## Wait for vllm server to start up +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +# +## Run benchmark +set -x +python3 bench_serving/benchmark_serving.py \ +--model $MODEL \ +--backend vllm \ +--base-url http://0.0.0.0:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json +set +x + +####### + +# +## Evals setup +# !TODO clean env vars +EVAL_RESULT_DIR=${EVAL_RESULT_DIR:-eval_out} +OPENAI_SERVER_BASE="http://0.0.0.0:${PORT}" +OPENAI_COMP_BASE="$OPENAI_SERVER_BASE/v1/completions" +OPENAI_CHAT_BASE="$OPENAI_SERVER_BASE/v1/chat/completions" +export OPENAI_API_KEY=${OPENAI_API_KEY:-EMPTY} + +# Patch to convert bypass regex error if content field is empty +PATCH_DIR="$(mktemp -d)" +cat > "$PATCH_DIR/sitecustomize.py" <<'PY' +import re, sys, unicodedata +from lm_eval.filters import extraction as ex + +def _s(x): # coerce to str + return x if isinstance(x, str) else "" + +# --- Patch RegexFilter.apply (used by many datasets) --- +_orig_regex_apply = ex.RegexFilter.apply +def _safe_regex_apply(self, resps, docs): + out = [] + for inst in resps: # inst is a list of candidate responses for one doc + filtered = [] + for resp in inst: + txt = _s(resp) + m = self.regex.findall(txt) + if m: + m = m[self.group_select] + if isinstance(m, tuple): + m = [t for t in m if t] + m = m[0] if m else self.fallback + m = m.strip() + else: + m = self.fallback + filtered.append(m) + out.append(filtered) + return out +ex.RegexFilter.apply = _safe_regex_apply + +# --- Patch MultiChoiceRegexFilter.apply (used by GSM8K flexible-extract) --- +_orig_mc_apply = ex.MultiChoiceRegexFilter.apply +def _safe_mc_apply(self, resps, docs): + def find_match(regex, resp, convert_dict={}): + txt = _s(resp) + match = regex.findall(txt) + if match: + match = match[self.group_select] + if isinstance(match, tuple): + match = [m for m in match if m] + if match: + match = match[0] + if match: + match = match.strip() + if match in convert_dict: + return convert_dict[match] + return match + return None + + punct_tbl = dict.fromkeys( + i for i in range(sys.maxunicode) + if unicodedata.category(chr(i)).startswith("P") + ) + + def filter_ignores(st): + st = _s(st) + if self.regexes_to_ignore is not None: + for s in self.regexes_to_ignore: + st = re.sub(s, "", st) + if self.ignore_case: + st = st.lower() + if self.ignore_punctuation: + st = st.translate(punct_tbl) + return st + + out = [] + for r, doc in zip(resps, docs): + # Build fallback regexes from choices (A, B, C, ...) 
as in upstream + fallback_regexes, choice_to_alpha = [], {} + next_alpha = "A" + without_paren, without_paren_to_target = [], {} + for c in doc.get("choices", []): + m = filter_ignores(c.strip()) + fallback_regexes.append(re.escape(m)) + choice_to_alpha[m] = f"({next_alpha})" + without_paren.append(next_alpha) + without_paren_to_target[next_alpha] = f"({next_alpha})" + next_alpha = chr(ord(next_alpha) + 1) + + fallback_regex = re.compile("|".join(fallback_regexes)) if fallback_regexes else None + without_paren_regex = re.compile(rf":[\s]*({'|'.join(without_paren)})") if without_paren else None + + filtered = [] + for resp in r: + m = find_match(self.regex, resp) + if not m and fallback_regex: + m = find_match(fallback_regex, filter_ignores(resp), choice_to_alpha) + if not m and without_paren_regex: + m = find_match(without_paren_regex, resp, without_paren_to_target) + if not m: + m = self.fallback + filtered.append(m) + out.append(filtered) + return out + +ex.MultiChoiceRegexFilter.apply = _safe_mc_apply +PY + +export PYTHONPATH="${PATCH_DIR}:${PYTHONPATH:-}" +set -x +python3 -m lm_eval --model local-chat-completions --apply_chat_template \ +--tasks ${EVAL_TASK:-gsm8k} \ +--num_fewshot ${NUM_FEWSHOT:-5} \ +--batch_size 2 \ +--output_path "/workspace/${EVAL_RESULT_DIR}" \ +--model_args "model=$MODEL,base_url=$OPENAI_CHAT_BASE,api_key=$OPENAI_API_KEY,eos_string=,max_retries=3,num_concurrent=32,tokenized_requests=False" \ +--gen_kwargs "max_tokens=8192,temperature=0,top_p=1" +set +x + +# Append a Markdown table to the GitHub Actions job summary using helper in bench_serving +if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then +python3 bench_serving/lm_eval_to_md.py \ + --results-dir "/workspace/${EVAL_RESULT_DIR}" \ + --task "${EVAL_TASK:-gsm8k}" \ + --framework "${FRAMEWORK}" \ + --precision "${PRECISION}" \ + --tp "${TP:-1}" \ + --ep "${EP_SIZE:-1}" \ + --dp-attention "${DP_ATTENTION:-false}" \ + >> "$GITHUB_STEP_SUMMARY" || true +fi + +echo "Evaluation completed. 
Results in /workspace/${EVAL_RESULT_DIR}" +exit 0 + + + diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh index f15e6261c..d593eb361 100644 --- a/benchmarks/gptoss_fp4_mi325x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh @@ -13,7 +13,6 @@ # CONC # RESULT_FILENAME - echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" hf download $MODEL @@ -35,6 +34,8 @@ export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 +# +## Start up vllm server set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ diff --git a/benchmarks/gptoss_fp4_mi355x_slurm.sh b/benchmarks/gptoss_fp4_mi355x_slurm.sh index a2adf2952..d21720add 100644 --- a/benchmarks/gptoss_fp4_mi355x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi355x_slurm.sh @@ -26,6 +26,8 @@ export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_FUSED_MOE_A16W4=1 +# +## Start up vllm server set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ From 173d7bf46f1b83ba94eba2794e34fbb1fe3dea63 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 08:01:06 -0600 Subject: [PATCH 050/214] evals h100-cr --- .../{eval-h100-gms8k.yml => eval-gms8k.yml} | 16 +- benchmarks/benchmark_lib.sh | 210 ++++++++++++++++++ benchmarks/gptoss_fp4_h100_slurm.sh | 4 + benchmarks/gptoss_fp4_mi300x_slurm.sh | 24 -- 4 files changed, 222 insertions(+), 32 deletions(-) rename .github/workflows/{eval-h100-gms8k.yml => eval-gms8k.yml} (76%) diff --git a/.github/workflows/eval-h100-gms8k.yml b/.github/workflows/eval-gms8k.yml similarity index 76% rename from .github/workflows/eval-h100-gms8k.yml rename to .github/workflows/eval-gms8k.yml index 84723f596..1822ba209 100644 --- a/.github/workflows/eval-h100-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -35,21 +35,21 @@ on: default: "200" push: paths: - - '.github/workflows/eval-h100-gms8k.yml' + - '.github/workflows/eval-gms8k.yml' - '.github/workflows/eval-tmpl.yml' - - 'benchmarks/dsr1_fp8_mi325x_slurm.sh' + - 'benchmarks/gptoss_fp4_h100_slurm.sh' jobs: eval: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi325x-tw_1 - image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' }} - model: ${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }} - framework: sglang - precision: fp8 - exp-name: dsr1_gsm8k_poc + runner: h100-cr_1 + image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} + model: ${{ inputs.model || 'openai/gpt-oss-120b' }} + framework: vllm + precision: fp4 + exp-name: gptoss_gsm8k_poc tp: ${{ inputs.tp || '4' }} ep: '1' dp-attn: false diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index cc3448d40..7c361c649 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -212,3 +212,213 @@ run_benchmark_serving() { --result-filename "$result_filename.json" set +x } + + +# ------------------------------ +# Eval (lm-eval-harness) helpers +# ------------------------------ + +# Ensure bench_serving repo is available for helper utilities (e.g., md summary) +_ensure_bench_serving_repo() { + set +x + git config --global --add safe.directory /workspace || true + if [[ ! 
-d bench_serving ]]; then + git clone https://github.com/kimbochen/bench_serving.git || true + fi +} + +# Install or update lm-eval dependencies +_install_lm_eval_deps() { + set +x + python3 -m pip install -q --no-cache-dir "lm-eval[api]" || true + # Temporary: workaround known harness issue by using main + python3 -m pip install -q --no-cache-dir --no-deps \ + "git+https://github.com/EleutherAI/lm-evaluation-harness.git@main" || true +} + +# Patch lm-eval filters to be robust to empty strings via sitecustomize +_patch_lm_eval_filters() { + set +x + local patch_dir + patch_dir="$(mktemp -d)" + cat > "$patch_dir/sitecustomize.py" <<'PY' +import re, sys, unicodedata +from lm_eval.filters import extraction as ex + +def _s(x): # coerce to str + return x if isinstance(x, str) else "" + +# --- Patch RegexFilter.apply (used by many datasets) --- +_orig_regex_apply = ex.RegexFilter.apply +def _safe_regex_apply(self, resps, docs): + out = [] + for inst in resps: # inst is a list of candidate responses for one doc + filtered = [] + for resp in inst: + txt = _s(resp) + m = self.regex.findall(txt) + if m: + m = m[self.group_select] + if isinstance(m, tuple): + m = [t for t in m if t] + m = m[0] if m else self.fallback + m = m.strip() + else: + m = self.fallback + filtered.append(m) + out.append(filtered) + return out +ex.RegexFilter.apply = _safe_regex_apply + +# --- Patch MultiChoiceRegexFilter.apply (used by GSM8K flexible-extract) --- +_orig_mc_apply = ex.MultiChoiceRegexFilter.apply +def _safe_mc_apply(self, resps, docs): + def find_match(regex, resp, convert_dict={}): + txt = _s(resp) + match = regex.findall(txt) + if match: + match = match[self.group_select] + if isinstance(match, tuple): + match = [m for m in match if m] + if match: + match = match[0] + if match: + match = match.strip() + if match in convert_dict: + return convert_dict[match] + return match + return None + + punct_tbl = dict.fromkeys( + i for i in range(sys.maxunicode) + if unicodedata.category(chr(i)).startswith("P") + ) + + def filter_ignores(st): + st = _s(st) + if self.regexes_to_ignore is not None: + for s in self.regexes_to_ignore: + st = re.sub(s, "", st) + if self.ignore_case: + st = st.lower() + if self.ignore_punctuation: + st = st.translate(punct_tbl) + return st + + out = [] + for r, doc in zip(resps, docs): + # Build fallback regexes from choices (A, B, C, ...) 
as in upstream + fallback_regexes, choice_to_alpha = [], {} + next_alpha = "A" + without_paren, without_paren_to_target = [], {} + for c in doc.get("choices", []): + m = filter_ignores(c.strip()) + fallback_regexes.append(re.escape(m)) + choice_to_alpha[m] = f"({next_alpha})" + without_paren.append(next_alpha) + without_paren_to_target[next_alpha] = f"({next_alpha})" + next_alpha = chr(ord(next_alpha) + 1) + + fallback_regex = re.compile("|".join(fallback_regexes)) if fallback_regexes else None + without_paren_regex = re.compile(rf":[\s]*({'|'.join(without_paren)})") if without_paren else None + + filtered = [] + for resp in r: + m = find_match(self.regex, resp) + if not m and fallback_regex: + m = find_match(fallback_regex, filter_ignores(resp), choice_to_alpha) + if not m and without_paren_regex: + m = find_match(without_paren_regex, resp, without_paren_to_target) + if not m: + m = self.fallback + filtered.append(m) + out.append(filtered) + return out + +ex.MultiChoiceRegexFilter.apply = _safe_mc_apply +PY + export PYTHONPATH="${patch_dir}:${PYTHONPATH:-}" +} + +# Run an lm-eval-harness task against a local OpenAI-compatible server +# Parameters: +# --port: Server port (default: $PORT or 8888) +# --task: Eval task (default: $EVAL_TASK or gsm8k) +# --num-fewshot: Fewshot k (default: $NUM_FEWSHOT or 5) +# --results-dir: Output dir (default: $EVAL_RESULT_DIR or eval_out) +# --batch-size: Harness batch size (default: 2) +# --gen-max-tokens: Max tokens for generation (default: 8192) +# --temperature: Temperature (default: 0) +# --top-p: Top-p (default: 1) +run_lm_eval() { + set +x + local port="${PORT:-8888}" + local task="${EVAL_TASK:-gsm8k}" + local num_fewshot="${NUM_FEWSHOT:-5}" + local results_dir="${EVAL_RESULT_DIR:-eval_out}" + local batch_size=2 + local gen_max_tokens=8192 + local temperature=0 + local top_p=1 + + # Parse arguments + while [[ $# -gt 0 ]]; do + case $1 in + --port) + port="$2"; shift 2;; + --task) + task="$2"; shift 2;; + --num-fewshot) + num_fewshot="$2"; shift 2;; + --results-dir) + results_dir="$2"; shift 2;; + --batch-size) + batch_size="$2"; shift 2;; + --gen-max-tokens) + gen_max_tokens="$2"; shift 2;; + --temperature) + temperature="$2"; shift 2;; + --top-p) + top_p="$2"; shift 2;; + *) + echo "Unknown parameter: $1"; return 1;; + esac + done + + _ensure_bench_serving_repo + _install_lm_eval_deps + _patch_lm_eval_filters + + local openai_server_base="http://0.0.0.0:${port}" + local openai_chat_base="$openai_server_base/v1/chat/completions" + export OPENAI_API_KEY=${OPENAI_API_KEY:-EMPTY} + + set -x + python3 -m lm_eval --model local-chat-completions --apply_chat_template \ + --tasks "${task}" \ + --num_fewshot "${num_fewshot}" \ + --batch_size "${batch_size}" \ + --output_path "/workspace/${results_dir}" \ + --model_args "model=${MODEL},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=3,num_concurrent=32,tokenized_requests=False" \ + --gen_kwargs "max_tokens=${gen_max_tokens},temperature=${temperature},top_p=${top_p}" + set +x +} + +# Append a Markdown summary to GitHub step summary (no-op if not in GH Actions) +append_lm_eval_summary() { + set +x + local results_dir="${EVAL_RESULT_DIR:-eval_out}" + local task="${EVAL_TASK:-gsm8k}" + if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then + _ensure_bench_serving_repo + python3 bench_serving/lm_eval_to_md.py \ + --results-dir "/workspace/${results_dir}" \ + --task "${task}" \ + --framework "${FRAMEWORK}" \ + --precision "${PRECISION}" \ + --tp "${TP:-1}" \ + --ep "${EP_SIZE:-1}" \ + 
--dp-attention "${DP_ATTENTION:-false}" \ + >> "$GITHUB_STEP_SUMMARY" || true + fi +} diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh index 82fad1dd7..b463a8aaf 100644 --- a/benchmarks/gptoss_fp4_h100_slurm.sh +++ b/benchmarks/gptoss_fp4_h100_slurm.sh @@ -58,3 +58,7 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation (defaults to GSM8K) +run_lm_eval --port "$PORT" +append_lm_eval_summary diff --git a/benchmarks/gptoss_fp4_mi300x_slurm.sh b/benchmarks/gptoss_fp4_mi300x_slurm.sh index eadfa16b3..eaacf6b1b 100644 --- a/benchmarks/gptoss_fp4_mi300x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi300x_slurm.sh @@ -85,30 +85,6 @@ python3 -m pip install -q --no-cache-dir "lm-eval[api]" || true # Temporary: workaround known harness issue python3 -m pip install -q --no-cache-dir --no-deps "git+https://github.com/EleutherAI/lm-evaluation-harness.git@main" || true -# -## Wait for vllm server to start up -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -# -## Run benchmark -set -x -python3 bench_serving/benchmark_serving.py \ ---model $MODEL \ ---backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json -set +x ####### From 4ff8a9b2b8ad0b7651e4c68ab67ee8323319c687 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 08:21:41 -0600 Subject: [PATCH 051/214] evals h100-cw --- .github/workflows/eval-gms8k.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 1822ba209..92919e356 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -44,7 +44,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: h100-cr_1 + runner: h100-cw_1 image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm From 83901e7427bbd4c6cc46de9e26eb80dbdcff9fd0 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 14:02:45 -0600 Subject: [PATCH 052/214] evals h200-nb --- .github/workflows/eval-gms8k.yml | 4 ++-- .github/workflows/eval-tmpl.yml | 2 +- benchmarks/gptoss_fp4_h200_slurm.sh | 4 ++++ 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 92919e356..752c92ce2 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -37,14 +37,14 @@ on: paths: - '.github/workflows/eval-gms8k.yml' - '.github/workflows/eval-tmpl.yml' - - 'benchmarks/gptoss_fp4_h100_slurm.sh' + - 'benchmarks/gptoss_fp4_h200_slurm.sh' jobs: eval: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: h100-cw_1 + runner: h200-nb_1 image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 9c4e77c78..7b42a7853 100644 --- a/.github/workflows/eval-tmpl.yml +++ 
b/.github/workflows/eval-tmpl.yml @@ -65,7 +65,7 @@ env: LIMIT: ${{ inputs.limit }} EVAL_RESULT_DIR: eval_out # Server-side concurrency default (used by some server scripts) - CONC: '32' + CONC: '16' MAX_MODEL_LEN: '8192' ISL: 1024 OSL: 8192 diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh index f87361ffd..2c18d4d6a 100644 --- a/benchmarks/gptoss_fp4_h200_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_slurm.sh @@ -67,3 +67,7 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation (defaults to GSM8K) +run_lm_eval --port "$PORT" +append_lm_eval_summary \ No newline at end of file From 6c65a24e646efd17dd0a16fc11c2616721a7913e Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 14:26:20 -0600 Subject: [PATCH 053/214] move eval script here --- .github/workflows/eval-gms8k.yml | 1 + benchmarks/benchmark_lib.sh | 15 +--- utils/lm_eval_to_md.py | 137 +++++++++++++++++++++++++++++++ 3 files changed, 140 insertions(+), 13 deletions(-) create mode 100644 utils/lm_eval_to_md.py diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 752c92ce2..23e2be4bb 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -38,6 +38,7 @@ on: - '.github/workflows/eval-gms8k.yml' - '.github/workflows/eval-tmpl.yml' - 'benchmarks/gptoss_fp4_h200_slurm.sh' + - 'benchmarks/benchmark_lib.sh' jobs: eval: diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 7c361c649..3a9df22bc 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -218,15 +218,6 @@ run_benchmark_serving() { # Eval (lm-eval-harness) helpers # ------------------------------ -# Ensure bench_serving repo is available for helper utilities (e.g., md summary) -_ensure_bench_serving_repo() { - set +x - git config --global --add safe.directory /workspace || true - if [[ ! -d bench_serving ]]; then - git clone https://github.com/kimbochen/bench_serving.git || true - fi -} - # Install or update lm-eval dependencies _install_lm_eval_deps() { set +x @@ -384,8 +375,7 @@ run_lm_eval() { echo "Unknown parameter: $1"; return 1;; esac done - - _ensure_bench_serving_repo + _install_lm_eval_deps _patch_lm_eval_filters @@ -410,8 +400,7 @@ append_lm_eval_summary() { local results_dir="${EVAL_RESULT_DIR:-eval_out}" local task="${EVAL_TASK:-gsm8k}" if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then - _ensure_bench_serving_repo - python3 bench_serving/lm_eval_to_md.py \ + python3 utils/lm_eval_to_md.py \ --results-dir "/workspace/${results_dir}" \ --task "${task}" \ --framework "${FRAMEWORK}" \ diff --git a/utils/lm_eval_to_md.py b/utils/lm_eval_to_md.py new file mode 100644 index 000000000..dbcc4d88d --- /dev/null +++ b/utils/lm_eval_to_md.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python3 +""" +Convert latest lm-evaluation-harness JSON in a results dir into a Markdown table +for GitHub Actions job summary. Prints to stdout. 
+ +Usage: + python3 bench_serving/scripts/lm_eval_to_md.py \ + --results-dir /workspace/eval_out \ + --task gsm8k \ + --framework vLLM \ + --precision fp16 \ + --tp 4 \ + --ep 1 \ + --dp-attention false +""" +import argparse, json, os, re, sys +from collections import Counter +from glob import glob + +def find_latest_json(results_dir: str): + paths = [] + for root, _, _ in os.walk(results_dir): + paths.extend(glob(os.path.join(root, "*.json"))) + if not paths: + return None + paths.sort(key=lambda p: os.path.getmtime(p), reverse=True) + return paths[0] + +def pct(x): + return f"{x*100:.2f}%" if isinstance(x, (int, float)) else "N/A" + +def se(x): + return f" \u00B1{(x*100):.2f}%" if isinstance(x, (int, float)) else "" + +def gpu_cpu_from_pretty_env(pe: str): + if not isinstance(pe, str) or not pe: + return "Unknown GPU" + gpu_lines = [l for l in pe.splitlines() if l.startswith("GPU ")] + names = [re.sub(r"GPU \d+:\s*", "", l).strip() for l in gpu_lines] + c = Counter(names) + gpu_summary = " + ".join([f"{n}\u00D7 {name}" for name, n in c.items()]) if c else "Unknown GPU" + cpu_line = next((l.split(":", 1)[1].strip() for l in pe.splitlines() if l.startswith("Model name:")), None) + return gpu_summary + (f" ({cpu_line})" if cpu_line else "") + +def extract_metrics(data: dict, task: str): + # results section can vary across harness versions + res_all = data.get("results", {}) or {} + res = res_all.get(task) if isinstance(res_all, dict) else {} + if not res and isinstance(res_all, dict) and res_all: + # fallback to first key if requested task missing + any_key = next(iter(res_all.keys())) + res = res_all.get(any_key, {}) + task = any_key + + strict = res.get("exact_match,strict-match") + flex = res.get("exact_match,flexible-extract") + strict_se = res.get("exact_match_stderr,strict-match") + flex_se = res.get("exact_match_stderr,flexible-extract") + + n_eff = None + ns = data.get("n-samples") or data.get("n_samples") or {} + if isinstance(ns, dict): + tdict = ns.get(task) or ns.get("gsm8k") or {} + if isinstance(tdict, dict): + n_eff = tdict.get("effective") or tdict.get("n_eff") + + # model/fewshot/limit are scattered depending on version + model = data.get("model_name") \ + or data.get("configs", {}).get(task, {}).get("metadata", {}).get("model") \ + or data.get("config", {}).get("model") \ + or "" + + # k-shot + fewshot = None + nshot = data.get("n-shot") or data.get("n_shot") or {} + if isinstance(nshot, dict): + fewshot = nshot.get(task) or nshot.get("gsm8k") + + # limit + limit = None + cfg = data.get("config") or {} + if isinstance(cfg, dict): + limit = cfg.get("limit") + + return { + "task": task, + "strict": strict, + "flex": flex, + "strict_se": strict_se, + "flex_se": flex_se, + "n_eff": n_eff, + "model": model, + "fewshot": fewshot, + "limit": limit + } + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--results-dir", required=True) + ap.add_argument("--task", default="gsm8k") + ap.add_argument("--framework", default=os.environ.get("FRAMEWORK", "vLLM")) + ap.add_argument("--precision", default=os.environ.get("PRECISION", "fp16")) + ap.add_argument("--tp", default=os.environ.get("TP", "1")) + ap.add_argument("--ep", default=os.environ.get("EP_SIZE", "1")) + ap.add_argument("--dp-attention", default=os.environ.get("DP_ATTENTION", "false")) + args = ap.parse_args() + + path = find_latest_json(args.results_dir) + print(f"### {args.task} Evaluation\n") + if not path or not os.path.exists(path): + print(f"> No result JSON found in `{args.results_dir}`.") + return + + 
with open(path, "r") as f: + data = json.load(f) + + hardware = gpu_cpu_from_pretty_env(data.get("pretty_env_info", "")) + m = extract_metrics(data, args.task) + + print("| Hardware | Framework | Precision | TP | EP | DP Attention | EM Strict | EM Flexible | N (eff) |") + print("|---|---|---:|--:|--:|:--:|--:|--:|--:|") + print(f"| {hardware} | {args.framework} | {args.precision} | {args.tp} | {args.ep} | {str(args.dp_attention).lower()} | " + f"{pct(m['strict'])}{se(m['strict_se'])} | {pct(m['flex'])}{se(m['flex_se'])} | {m['n_eff'] or ''} |") + + # metadata line + lim = m["limit"] + lim_str = str(int(lim)) if isinstance(lim, (int, float)) else (str(lim) if lim is not None else "") + fewshot = m["fewshot"] if m["fewshot"] is not None else "" + print(f"\n_Model_: `{m['model']}`    _k-shot_: **{fewshot}**    _limit_: **{lim_str}** \n_Source_: `{os.path.basename(path)}`") + +if __name__ == "__main__": + try: + main() + except Exception as e: + # Never blow up the CI summary; emit a helpful line instead. + print(f"> Failed to render evaluation summary: {e}") + sys.exit(0) \ No newline at end of file From 343d24e045be0888a7e11ab624223d6c3d8c9771 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 15:33:30 -0600 Subject: [PATCH 054/214] evals mi300x-amd --- .github/workflows/eval-gms8k.yml | 6 +- benchmarks/gptoss_fp4_mi300x_docker.sh | 7 +- benchmarks/gptoss_fp4_mi300x_slurm.sh | 159 +------------------------ benchmarks/gptoss_fp4_mi325x_docker.sh | 5 + benchmarks/gptoss_fp4_mi325x_slurm.sh | 5 + benchmarks/gptoss_fp4_mi355x_docker.sh | 5 + benchmarks/gptoss_fp4_mi355x_slurm.sh | 5 + 7 files changed, 32 insertions(+), 160 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 23e2be4bb..a814e0da3 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -37,16 +37,14 @@ on: paths: - '.github/workflows/eval-gms8k.yml' - '.github/workflows/eval-tmpl.yml' - - 'benchmarks/gptoss_fp4_h200_slurm.sh' - - 'benchmarks/benchmark_lib.sh' jobs: eval: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: h200-nb_1 - image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} + runner: mi300x-amd_0 + image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm precision: fp4 diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh index 7d1f98226..63dcf76e1 100644 --- a/benchmarks/gptoss_fp4_mi300x_docker.sh +++ b/benchmarks/gptoss_fp4_mi300x_docker.sh @@ -56,4 +56,9 @@ run_benchmark_serving \ --num-prompts $(( $CONC * 10 )) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ No newline at end of file + --result-dir /workspace/ + +# After throughput, run evaluation (defaults to GSM8K) +run_lm_eval --port "$PORT" +append_lm_eval_summary +set +x \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_mi300x_slurm.sh b/benchmarks/gptoss_fp4_mi300x_slurm.sh index eaacf6b1b..b0ba7db04 100644 --- a/benchmarks/gptoss_fp4_mi300x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi300x_slurm.sh @@ -69,159 +69,8 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ -set +x - -# -## Ensure benching scripts present -git config --global --add safe.directory /workspace || true -if [[ ! 
-d bench_serving ]]; then - git clone https://github.com/kimbochen/bench_serving.git -fi - -# -## Deps for lm-eval -#python3 -m pip install -q --upgrade pip || true -python3 -m pip install -q --no-cache-dir "lm-eval[api]" || true -# Temporary: workaround known harness issue -python3 -m pip install -q --no-cache-dir --no-deps "git+https://github.com/EleutherAI/lm-evaluation-harness.git@main" || true - - -####### - -# -## Evals setup -# !TODO clean env vars -EVAL_RESULT_DIR=${EVAL_RESULT_DIR:-eval_out} -OPENAI_SERVER_BASE="http://0.0.0.0:${PORT}" -OPENAI_COMP_BASE="$OPENAI_SERVER_BASE/v1/completions" -OPENAI_CHAT_BASE="$OPENAI_SERVER_BASE/v1/chat/completions" -export OPENAI_API_KEY=${OPENAI_API_KEY:-EMPTY} - -# Patch to convert bypass regex error if content field is empty -PATCH_DIR="$(mktemp -d)" -cat > "$PATCH_DIR/sitecustomize.py" <<'PY' -import re, sys, unicodedata -from lm_eval.filters import extraction as ex - -def _s(x): # coerce to str - return x if isinstance(x, str) else "" - -# --- Patch RegexFilter.apply (used by many datasets) --- -_orig_regex_apply = ex.RegexFilter.apply -def _safe_regex_apply(self, resps, docs): - out = [] - for inst in resps: # inst is a list of candidate responses for one doc - filtered = [] - for resp in inst: - txt = _s(resp) - m = self.regex.findall(txt) - if m: - m = m[self.group_select] - if isinstance(m, tuple): - m = [t for t in m if t] - m = m[0] if m else self.fallback - m = m.strip() - else: - m = self.fallback - filtered.append(m) - out.append(filtered) - return out -ex.RegexFilter.apply = _safe_regex_apply - -# --- Patch MultiChoiceRegexFilter.apply (used by GSM8K flexible-extract) --- -_orig_mc_apply = ex.MultiChoiceRegexFilter.apply -def _safe_mc_apply(self, resps, docs): - def find_match(regex, resp, convert_dict={}): - txt = _s(resp) - match = regex.findall(txt) - if match: - match = match[self.group_select] - if isinstance(match, tuple): - match = [m for m in match if m] - if match: - match = match[0] - if match: - match = match.strip() - if match in convert_dict: - return convert_dict[match] - return match - return None - - punct_tbl = dict.fromkeys( - i for i in range(sys.maxunicode) - if unicodedata.category(chr(i)).startswith("P") - ) - - def filter_ignores(st): - st = _s(st) - if self.regexes_to_ignore is not None: - for s in self.regexes_to_ignore: - st = re.sub(s, "", st) - if self.ignore_case: - st = st.lower() - if self.ignore_punctuation: - st = st.translate(punct_tbl) - return st - - out = [] - for r, doc in zip(resps, docs): - # Build fallback regexes from choices (A, B, C, ...) 
as in upstream - fallback_regexes, choice_to_alpha = [], {} - next_alpha = "A" - without_paren, without_paren_to_target = [], {} - for c in doc.get("choices", []): - m = filter_ignores(c.strip()) - fallback_regexes.append(re.escape(m)) - choice_to_alpha[m] = f"({next_alpha})" - without_paren.append(next_alpha) - without_paren_to_target[next_alpha] = f"({next_alpha})" - next_alpha = chr(ord(next_alpha) + 1) - - fallback_regex = re.compile("|".join(fallback_regexes)) if fallback_regexes else None - without_paren_regex = re.compile(rf":[\s]*({'|'.join(without_paren)})") if without_paren else None - - filtered = [] - for resp in r: - m = find_match(self.regex, resp) - if not m and fallback_regex: - m = find_match(fallback_regex, filter_ignores(resp), choice_to_alpha) - if not m and without_paren_regex: - m = find_match(without_paren_regex, resp, without_paren_to_target) - if not m: - m = self.fallback - filtered.append(m) - out.append(filtered) - return out - -ex.MultiChoiceRegexFilter.apply = _safe_mc_apply -PY - -export PYTHONPATH="${PATCH_DIR}:${PYTHONPATH:-}" -set -x -python3 -m lm_eval --model local-chat-completions --apply_chat_template \ ---tasks ${EVAL_TASK:-gsm8k} \ ---num_fewshot ${NUM_FEWSHOT:-5} \ ---batch_size 2 \ ---output_path "/workspace/${EVAL_RESULT_DIR}" \ ---model_args "model=$MODEL,base_url=$OPENAI_CHAT_BASE,api_key=$OPENAI_API_KEY,eos_string=,max_retries=3,num_concurrent=32,tokenized_requests=False" \ ---gen_kwargs "max_tokens=8192,temperature=0,top_p=1" -set +x - -# Append a Markdown table to the GitHub Actions job summary using helper in bench_serving -if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then -python3 bench_serving/lm_eval_to_md.py \ - --results-dir "/workspace/${EVAL_RESULT_DIR}" \ - --task "${EVAL_TASK:-gsm8k}" \ - --framework "${FRAMEWORK}" \ - --precision "${PRECISION}" \ - --tp "${TP:-1}" \ - --ep "${EP_SIZE:-1}" \ - --dp-attention "${DP_ATTENTION:-false}" \ - >> "$GITHUB_STEP_SUMMARY" || true -fi - -echo "Evaluation completed. 
Results in /workspace/${EVAL_RESULT_DIR}" -exit 0 - - +# After throughput, run evaluation (defaults to GSM8K) +run_lm_eval --port "$PORT" +append_lm_eval_summary +set +x \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_mi325x_docker.sh b/benchmarks/gptoss_fp4_mi325x_docker.sh index 46462ad6d..ccfe6e1c3 100644 --- a/benchmarks/gptoss_fp4_mi325x_docker.sh +++ b/benchmarks/gptoss_fp4_mi325x_docker.sh @@ -56,3 +56,8 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation (defaults to GSM8K) +run_lm_eval --port "$PORT" +append_lm_eval_summary +set +x \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh index d593eb361..4219d0662 100644 --- a/benchmarks/gptoss_fp4_mi325x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh @@ -68,3 +68,8 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation (defaults to GSM8K) +run_lm_eval --port "$PORT" +append_lm_eval_summary +set +x \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh index 0e54245d4..f63cc9960 100644 --- a/benchmarks/gptoss_fp4_mi355x_docker.sh +++ b/benchmarks/gptoss_fp4_mi355x_docker.sh @@ -55,3 +55,8 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation (defaults to GSM8K) +run_lm_eval --port "$PORT" +append_lm_eval_summary +set +x \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_mi355x_slurm.sh b/benchmarks/gptoss_fp4_mi355x_slurm.sh index d21720add..0dd860bb1 100644 --- a/benchmarks/gptoss_fp4_mi355x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi355x_slurm.sh @@ -59,3 +59,8 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation (defaults to GSM8K) +run_lm_eval --port "$PORT" +append_lm_eval_summary +set +x \ No newline at end of file From 2de4a18a712acc85f1f2fe1c917684b03f0a61d2 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 15:36:32 -0600 Subject: [PATCH 055/214] evals mi325x-amd --- .github/workflows/eval-gms8k.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index a814e0da3..3b0f005ae 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -43,7 +43,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi300x-amd_0 + runner: mi325x-amd_0 image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm From 21825ce02174139a1647f925a86901db1607c5a7 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 15:44:22 -0600 Subject: [PATCH 056/214] evals mi300x-tw --- .github/workflows/eval-gms8k.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 3b0f005ae..beaaa13dd 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -43,7 +43,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi325x-amd_0 + runner: mi300x-tw_0 image: ${{ inputs.image || 
'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm From 00bfa341656c98f56287eade039a9ca126751d33 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 15:46:28 -0600 Subject: [PATCH 057/214] evals mi300x-oci --- .github/workflows/eval-gms8k.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index beaaa13dd..c74b7fe8a 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -43,7 +43,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi300x-tw_0 + runner: mi300x-oci_0 image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm From e8aa07eed7e6fddcbbaa2fb8ebd884c3dc81d150 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 15:47:04 -0600 Subject: [PATCH 058/214] evals mi325x-tw --- .github/workflows/eval-gms8k.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index c74b7fe8a..2a0308f6f 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -43,7 +43,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi300x-oci_0 + runner: mi325x-tw_0 image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm From bf4eff20534720e964bb64bc6a2cb1abc9b5e848 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 16:14:16 -0600 Subject: [PATCH 059/214] evals mi325x-tw summary --- benchmarks/benchmark_lib.sh | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 3a9df22bc..582358279 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -399,15 +399,34 @@ append_lm_eval_summary() { set +x local results_dir="${EVAL_RESULT_DIR:-eval_out}" local task="${EVAL_TASK:-gsm8k}" - if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then - python3 utils/lm_eval_to_md.py \ + # Render markdown once, then decide where to write it to avoid redirection errors + local md_out + md_out=$(python3 utils/lm_eval_to_md.py \ --results-dir "/workspace/${results_dir}" \ --task "${task}" \ --framework "${FRAMEWORK}" \ --precision "${PRECISION}" \ --tp "${TP:-1}" \ --ep "${EP_SIZE:-1}" \ - --dp-attention "${DP_ATTENTION:-false}" \ - >> "$GITHUB_STEP_SUMMARY" || true + --dp-attention "${DP_ATTENTION:-false}" 2>/dev/null || true) + + # If nothing was produced, nothing to append + if [ -z "${md_out}" ]; then + return 0 fi + + # Prefer GitHub step summary when available and path is valid; otherwise fallback to workspace file + if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then + local _gh_path="$GITHUB_STEP_SUMMARY" + local _gh_dir + _gh_dir="$(dirname "$_gh_path")" + if [ -d "$_gh_dir" ]; then + printf "%s\n" "${md_out}" >> "$_gh_path" || true + return 0 + fi + fi + + # Fallback: write to a summary file alongside results + mkdir -p "/workspace/${results_dir}" 2>/dev/null || true + printf "%s\n" "${md_out}" >> "/workspace/${results_dir}/SUMMARY.md" || true } From 71008bb68d19e0e87eed1612ad81c254e20b9f90 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 16:16:04 -0600 Subject: [PATCH 060/214] evals 
mi325x-tw summary --- .github/workflows/eval-gms8k.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 2a0308f6f..35bd4751a 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -43,7 +43,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi325x-tw_0 + runner: mi325x-tw_1 image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm From 7f3cd094551525bc50a315a1e2c705b021425cc7 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 16:25:23 -0600 Subject: [PATCH 061/214] evals mi355x-amd --- .github/workflows/eval-gms8k.yml | 2 +- .github/workflows/eval-tmpl.yml | 13 +++++ benchmarks/benchmark_lib.sh | 84 +++++++++++++++++++++++++++++--- 3 files changed, 92 insertions(+), 7 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 35bd4751a..188b36546 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -43,7 +43,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi325x-tw_1 + runner: mi355x-amd_4 image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 7b42a7853..2f2ccd621 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -125,6 +125,19 @@ jobs: run: | bash ./runners/launch_${RUNNER_NAME%%_*}.sh + - name: Append eval summary + if: always() + shell: bash + run: | + # If the compute node couldn't write to $GITHUB_STEP_SUMMARY directly, + # our scripts wrote a fallback markdown to ${EVAL_RESULT_DIR}/SUMMARY.md. 
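+        # Relaying it here keeps eval results visible in the run's summary page.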
+ if [[ -f "${{ env.EVAL_RESULT_DIR }}/SUMMARY.md" ]]; then + echo "Appending evaluation summary to GitHub step summary" + cat "${{ env.EVAL_RESULT_DIR }}/SUMMARY.md" >> "$GITHUB_STEP_SUMMARY" + else + echo "No fallback summary found at '${{ env.EVAL_RESULT_DIR }}/SUMMARY.md'" + fi + - name: Upload eval artifacts if: always() uses: actions/upload-artifact@v5 diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 582358279..52e6cdbf8 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -233,17 +233,22 @@ _patch_lm_eval_filters() { local patch_dir patch_dir="$(mktemp -d)" cat > "$patch_dir/sitecustomize.py" <<'PY' -import re, sys, unicodedata +# sitecustomize.py — loaded automatically by Python if on PYTHONPATH +import re, sys, unicodedata, types + +# ----------------------------- +# 1) Safe regex filters (yours) +# ----------------------------- from lm_eval.filters import extraction as ex def _s(x): # coerce to str return x if isinstance(x, str) else "" -# --- Patch RegexFilter.apply (used by many datasets) --- +# --- RegexFilter.apply --- _orig_regex_apply = ex.RegexFilter.apply def _safe_regex_apply(self, resps, docs): out = [] - for inst in resps: # inst is a list of candidate responses for one doc + for inst in resps: # list of candidates for one doc filtered = [] for resp in inst: txt = _s(resp) @@ -261,7 +266,7 @@ def _safe_regex_apply(self, resps, docs): return out ex.RegexFilter.apply = _safe_regex_apply -# --- Patch MultiChoiceRegexFilter.apply (used by GSM8K flexible-extract) --- +# --- MultiChoiceRegexFilter.apply (used by GSM8K flexible-extract) --- _orig_mc_apply = ex.MultiChoiceRegexFilter.apply def _safe_mc_apply(self, resps, docs): def find_match(regex, resp, convert_dict={}): @@ -298,7 +303,7 @@ def _safe_mc_apply(self, resps, docs): out = [] for r, doc in zip(resps, docs): - # Build fallback regexes from choices (A, B, C, ...) as in upstream + # Build fallback regexes from choices (A, B, C, ...) as upstream fallback_regexes, choice_to_alpha = [], {} next_alpha = "A" without_paren, without_paren_to_target = [], {} @@ -325,8 +330,75 @@ def _safe_mc_apply(self, resps, docs): filtered.append(m) out.append(filtered) return out - ex.MultiChoiceRegexFilter.apply = _safe_mc_apply + +# ----------------------------------------------------- +# 2) Fallback to reasoning_content in parse_generations +# ----------------------------------------------------- +# For OpenAI-like chat completions, some servers return: +# choices[0].message.content == None +# choices[0].message.reasoning_content == "" +# If so, return reasoning_content instead of None; if both missing, return "". 
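+# Illustrative response this guards against (assumed shape, not a captured payload):
+#   {"choices": [{"message": {"content": None,
+#                             "reasoning_content": "... the answer is 42."}}]}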
+ +from lm_eval.models.api_models import TemplateAPI + +def _wrap_parse_generations_on_class(cls): + if not hasattr(cls, "parse_generations"): + return + orig = cls.parse_generations + # parse_generations is a @staticmethod on API models; preserve staticmethod + def wrapped(*, outputs, **kwargs): + # First, run the original + res = orig(outputs=outputs, **kwargs) + # Normalize to list for convenience + if isinstance(res, (str, type(None))): + res = [res] + outputs_list = [outputs] + else: + outputs_list = outputs if isinstance(outputs, list) else [outputs] + + def _fallback_from_output(o): + try: + # OpenAI-style: dict -> choices[0] -> message + ch0 = (o or {}).get("choices", [{}])[0] + msg = ch0.get("message", {}) or {} + txt = msg.get("content") + if txt is None: + # Newer servers may use reasoning_content + txt = msg.get("reasoning_content") + if txt is None: + # Some servers put it at choices[0].reasoning.content + txt = (ch0.get("reasoning") or {}).get("content") + return "" if txt is None else txt + except Exception: + return "" + fb = [_fallback_from_output(o) for o in outputs_list] + + # Replace None/empty only if a fallback exists + res_out = [] + for i, v in enumerate(res): + if (v is None or v == "") and i < len(fb) and fb[i]: + res_out.append(fb[i]) + else: + # still coerce None -> "" so downstream filters never see None + res_out.append("" if v is None else v) + return res_out + + # Rebind as staticmethod to match original decoration + cls.parse_generations = staticmethod(wrapped) + +# Try to patch common OpenAI-like chat backends +try: + from lm_eval.models import openai_like as oli + for name in dir(oli): + obj = getattr(oli, name) + if isinstance(obj, type) and issubclass(obj, TemplateAPI): + # Heuristically target chat-style classes only + if "Chat" in obj.__name__ or "OpenAI" in obj.__name__: + _wrap_parse_generations_on_class(obj) +except Exception: + # If module layout changes, fail soft; your regex guards still protect filters. 
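+    # (e.g. a chat class such as LocalChatCompletion would be wrapped above; the name varies by lm-eval version)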
+ pass PY export PYTHONPATH="${patch_dir}:${PYTHONPATH:-}" } From dfff2f44ded4e03c88909ea2e3aec9f3e823a011 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 16:43:41 -0600 Subject: [PATCH 062/214] evals mi325x-tw summary --- .github/workflows/eval-gms8k.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 188b36546..35bd4751a 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -43,7 +43,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi355x-amd_4 + runner: mi325x-tw_1 image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm From 9a11152d612a7964e903806cbb989cb9e860a488 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 17:06:59 -0600 Subject: [PATCH 063/214] evals mi325x-tw summary --- .github/workflows/eval-tmpl.yml | 13 ------------- runners/launch_mi300x-amd.sh | 11 +++++++++++ runners/launch_mi325x-amd.sh | 10 ++++++++++ runners/launch_mi355x-amd.sh | 11 +++++++++++ 4 files changed, 32 insertions(+), 13 deletions(-) diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 2f2ccd621..7b42a7853 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -125,19 +125,6 @@ jobs: run: | bash ./runners/launch_${RUNNER_NAME%%_*}.sh - - name: Append eval summary - if: always() - shell: bash - run: | - # If the compute node couldn't write to $GITHUB_STEP_SUMMARY directly, - # our scripts wrote a fallback markdown to ${EVAL_RESULT_DIR}/SUMMARY.md. - if [[ -f "${{ env.EVAL_RESULT_DIR }}/SUMMARY.md" ]]; then - echo "Appending evaluation summary to GitHub step summary" - cat "${{ env.EVAL_RESULT_DIR }}/SUMMARY.md" >> "$GITHUB_STEP_SUMMARY" - else - echo "No fallback summary found at '${{ env.EVAL_RESULT_DIR }}/SUMMARY.md'" - fi - - name: Upload eval artifacts if: always() uses: actions/upload-artifact@v5 diff --git a/runners/launch_mi300x-amd.sh b/runners/launch_mi300x-amd.sh index 780e5a2f0..85fa1f8c7 100644 --- a/runners/launch_mi300x-amd.sh +++ b/runners/launch_mi300x-amd.sh @@ -8,6 +8,16 @@ PORT=8888 server_name="bmk-server" set -x +# Propagate GitHub summary file into the container when available +GH_SUM_ENV="" +GH_SUM_MOUNT="" +if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then + GH_SUM_ENV="-e GITHUB_STEP_SUMMARY=${GITHUB_STEP_SUMMARY}" + GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" + if [ -d "${GH_SUM_DIR}" ]; then + GH_SUM_MOUNT="-v ${GH_SUM_DIR}:${GH_SUM_DIR}" + fi +fi docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ @@ -15,6 +25,7 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ -e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \ +${GH_SUM_ENV} ${GH_SUM_MOUNT} \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi300x_docker.sh" diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh index 1065167d7..008e42577 100644 --- a/runners/launch_mi325x-amd.sh +++ b/runners/launch_mi325x-amd.sh @@ -22,3 +22,13 @@ srun 
--jobid=$JOB_ID \ bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_mi325x_slurm.sh scancel $JOB_ID + +# Append eval summary within this same step when available +if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then + GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" + if [ -d "${GH_SUM_DIR}" ]; then + if [ -f "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" ]; then + cat "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" >> "${GITHUB_STEP_SUMMARY}" || true + fi + fi +fi diff --git a/runners/launch_mi355x-amd.sh b/runners/launch_mi355x-amd.sh index 5f3cbb290..b1b11ff95 100644 --- a/runners/launch_mi355x-amd.sh +++ b/runners/launch_mi355x-amd.sh @@ -30,6 +30,16 @@ else fi set -x +# Propagate GitHub summary file into the container when available +GH_SUM_ENV="" +GH_SUM_MOUNT="" +if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then + GH_SUM_ENV="-e GITHUB_STEP_SUMMARY=${GITHUB_STEP_SUMMARY}" + GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" + if [ -d "${GH_SUM_DIR}" ]; then + GH_SUM_MOUNT="-v ${GH_SUM_DIR}:${GH_SUM_DIR}" + fi +fi docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ @@ -37,6 +47,7 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT -e NUM_PROMPTS \ -e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \ +${GH_SUM_ENV} ${GH_SUM_MOUNT} \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi355x_docker.sh" From 1ead6959b4a946fa5f0538f381ced78527871f07 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 17:18:11 -0600 Subject: [PATCH 064/214] evals mi325x-tw summary --- .github/workflows/eval-tmpl.yml | 2 +- runners/launch_mi325x-amd.sh | 1 + runners/launch_mi325x-tw.sh | 11 +++++++++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 7b42a7853..c2363540a 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -68,7 +68,7 @@ env: CONC: '16' MAX_MODEL_LEN: '8192' ISL: 1024 - OSL: 8192 + OSL: 1024 RANDOM_RANGE_RATIO: '1.0' RESULT_FILENAME: results diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh index 008e42577..68affc9a1 100644 --- a/runners/launch_mi325x-amd.sh +++ b/runners/launch_mi325x-amd.sh @@ -11,6 +11,7 @@ salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=256 --time=180 --no JOB_ID=$(squeue -u $USER -h -o %A | head -n1) srun --jobid=$JOB_ID bash -c "sudo enroot import -o $SQUASH_FILE docker://$IMAGE" + srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ diff --git a/runners/launch_mi325x-tw.sh b/runners/launch_mi325x-tw.sh index 488ce6ceb..aa87a424d 100644 --- a/runners/launch_mi325x-tw.sh +++ b/runners/launch_mi325x-tw.sh @@ -11,6 +11,7 @@ salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=128 --time=180 --no JOB_ID=$(squeue -u $USER -h -o %A | head -n1) srun --jobid=$JOB_ID bash -c "sudo enroot import -o $SQUASH_FILE docker://$IMAGE" + srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ @@ -22,3 +23,13 @@ srun --jobid=$JOB_ID \ bash 
benchmarks/${EXP_NAME%%_*}_${PRECISION}_mi325x_slurm.sh scancel $JOB_ID + +# Fallback: append summary after job completes if container couldn't write directly +if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then + GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" + if [ -d "${GH_SUM_DIR}" ]; then + if [ -f "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" ]; then + cat "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" >> "${GITHUB_STEP_SUMMARY}" || true + fi + fi +fi From 348d5d9dbdcc6441776a077ffdb94f7e52ccb438 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 17:33:20 -0600 Subject: [PATCH 065/214] all summary --- runners/launch_b200-nb.sh | 12 +++++++++++- runners/launch_b200-nv.sh | 10 ++++++++++ runners/launch_b200-nvd.sh | 12 ++++++++++++ runners/launch_b200-tg.sh | 12 ++++++++++++ runners/launch_h100-cr.sh | 11 +++++++++++ runners/launch_h100-cw.sh | 10 ++++++++++ runners/launch_h200-cw.sh | 10 ++++++++++ runners/launch_h200-nb.sh | 10 ++++++++++ runners/launch_h200-nv.sh | 10 ++++++++++ runners/launch_mi300x-cr.sh | 11 +++++++++++ runners/launch_mi300x-oci.sh | 11 +++++++++++ 11 files changed, 118 insertions(+), 1 deletion(-) diff --git a/runners/launch_b200-nb.sh b/runners/launch_b200-nb.sh index ecd1466dd..1502d0268 100644 --- a/runners/launch_b200-nb.sh +++ b/runners/launch_b200-nb.sh @@ -14,4 +14,14 @@ srun --partition=$PARTITION --gres=gpu:$TP --exclusive \ --no-container-entrypoint --export=ALL,PORT_OFFSET=${USER: -1} \ bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_slurm.sh -scancel $JOB_ID \ No newline at end of file +scancel $JOB_ID + +# Append eval summary within this same step when available +if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then + GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" + if [ -d "${GH_SUM_DIR}" ]; then + if [ -f "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" ]; then + cat "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" >> "${GITHUB_STEP_SUMMARY}" || true + fi + fi +fi diff --git a/runners/launch_b200-nv.sh b/runners/launch_b200-nv.sh index 243e624f9..8a1afff8e 100644 --- a/runners/launch_b200-nv.sh +++ b/runners/launch_b200-nv.sh @@ -23,3 +23,13 @@ srun --jobid=$JOB_ID \ bash benchmarks/${MODEL_CODE}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_slurm.sh scancel $JOB_ID + +# Append eval summary within this same step when available +if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then + GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" + if [ -d "${GH_SUM_DIR}" ]; then + if [ -f "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" ]; then + cat "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" >> "${GITHUB_STEP_SUMMARY}" || true + fi + fi +fi diff --git a/runners/launch_b200-nvd.sh b/runners/launch_b200-nvd.sh index c5216b006..12bb66b99 100644 --- a/runners/launch_b200-nvd.sh +++ b/runners/launch_b200-nvd.sh @@ -35,6 +35,17 @@ else export NUM_PROMPTS=$(( CONC * 10 )) fi +## Propagate GitHub summary file into the container when available +GH_SUM_ENV="" +GH_SUM_MOUNT="" +if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then + GH_SUM_ENV="-e GITHUB_STEP_SUMMARY=${GITHUB_STEP_SUMMARY}" + GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" + if [ -d "${GH_SUM_DIR}" ]; then + GH_SUM_MOUNT="-v ${GH_SUM_DIR}:${GH_SUM_DIR}" + fi +fi + docker run --rm --init --network host --name $server_name \ --runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ @@ -43,6 +54,7 @@ docker run --rm --init --network host 
--name $server_name \ -e NCCL_GRAPH_REGISTER=0 \ -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e NUM_PROMPTS \ + ${GH_SUM_ENV} ${GH_SUM_MOUNT} \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh" diff --git a/runners/launch_b200-tg.sh b/runners/launch_b200-tg.sh index 97e975a64..b82e25276 100644 --- a/runners/launch_b200-tg.sh +++ b/runners/launch_b200-tg.sh @@ -7,6 +7,17 @@ PORT=8888 server_name="bmk-server" set -x +## Propagate GitHub summary file into the container when available +GH_SUM_ENV="" +GH_SUM_MOUNT="" +if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then + GH_SUM_ENV="-e GITHUB_STEP_SUMMARY=${GITHUB_STEP_SUMMARY}" + GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" + if [ -d "${GH_SUM_DIR}" ]; then + GH_SUM_MOUNT="-v ${GH_SUM_DIR}:${GH_SUM_DIR}" + fi +fi + docker run --rm -d --network host --name $server_name \ --runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ @@ -14,6 +25,7 @@ docker run --rm -d --network host --name $server_name \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE \ -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ + ${GH_SUM_ENV} ${GH_SUM_MOUNT} \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh" diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh index d1ddc26de..9815e4884 100644 --- a/runners/launch_h100-cr.sh +++ b/runners/launch_h100-cr.sh @@ -6,12 +6,23 @@ PORT=8888 server_name="bmk-server" set -x +## Propagate GitHub summary file into the container when available +GH_SUM_ENV="" +GH_SUM_MOUNT="" +if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then + GH_SUM_ENV="-e GITHUB_STEP_SUMMARY=${GITHUB_STEP_SUMMARY}" + GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" + if [ -d "${GH_SUM_DIR}" ]; then + GH_SUM_MOUNT="-v ${GH_SUM_DIR}:${GH_SUM_DIR}" + fi +fi docker run --rm --network=host --name=$server_name \ --runtime=nvidia --gpus=all --ipc=host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \ -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ + ${GH_SUM_ENV} ${GH_SUM_MOUNT} \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_h100_docker.sh" diff --git a/runners/launch_h100-cw.sh b/runners/launch_h100-cw.sh index 0179bdd57..864dc9c95 100644 --- a/runners/launch_h100-cw.sh +++ b/runners/launch_h100-cw.sh @@ -18,3 +18,13 @@ srun --jobid=$JOB_ID \ bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_h100_slurm.sh scancel $JOB_ID + +# Append eval summary within this same step when available +if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then + GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" + if [ -d "${GH_SUM_DIR}" ]; then + if [ -f "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" ]; then + cat 
"${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" >> "${GITHUB_STEP_SUMMARY}" || true + fi + fi +fi diff --git a/runners/launch_h200-cw.sh b/runners/launch_h200-cw.sh index dd4937606..431e027f2 100644 --- a/runners/launch_h200-cw.sh +++ b/runners/launch_h200-cw.sh @@ -30,3 +30,13 @@ srun --jobid=$JOB_ID \ bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}_slurm.sh scancel $JOB_ID + +# Append eval summary within this same step when available +if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then + GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" + if [ -d "${GH_SUM_DIR}" ]; then + if [ -f "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" ]; then + cat "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" >> "${GITHUB_STEP_SUMMARY}" || true + fi + fi +fi diff --git a/runners/launch_h200-nb.sh b/runners/launch_h200-nb.sh index c76b366d2..19d6e82ba 100644 --- a/runners/launch_h200-nb.sh +++ b/runners/launch_h200-nb.sh @@ -30,3 +30,13 @@ srun --jobid=$JOB_ID \ bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}_slurm.sh scancel $JOB_ID + +# Append eval summary within this same step when available +if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then + GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" + if [ -d "${GH_SUM_DIR}" ]; then + if [ -f "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" ]; then + cat "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" >> "${GITHUB_STEP_SUMMARY}" || true + fi + fi +fi diff --git a/runners/launch_h200-nv.sh b/runners/launch_h200-nv.sh index 5319f8959..ca2ea6079 100644 --- a/runners/launch_h200-nv.sh +++ b/runners/launch_h200-nv.sh @@ -23,3 +23,13 @@ srun --jobid=$JOB_ID \ bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}_slurm.sh scancel $JOB_ID + +# Append eval summary within this same step when available +if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then + GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" + if [ -d "${GH_SUM_DIR}" ]; then + if [ -f "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" ]; then + cat "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" >> "${GITHUB_STEP_SUMMARY}" || true + fi + fi +fi diff --git a/runners/launch_mi300x-cr.sh b/runners/launch_mi300x-cr.sh index 8fbdaee63..4c9d56e7e 100644 --- a/runners/launch_mi300x-cr.sh +++ b/runners/launch_mi300x-cr.sh @@ -8,6 +8,16 @@ PORT=8888 server_name="bmk-server" set -x +## Propagate GitHub summary file into the container when available +GH_SUM_ENV="" +GH_SUM_MOUNT="" +if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then + GH_SUM_ENV="-e GITHUB_STEP_SUMMARY=${GITHUB_STEP_SUMMARY}" + GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" + if [ -d "${GH_SUM_DIR}" ]; then + GH_SUM_MOUNT="-v ${GH_SUM_DIR}:${GH_SUM_DIR}" + fi +fi docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ @@ -15,6 +25,7 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ -e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \ + ${GH_SUM_ENV} ${GH_SUM_MOUNT} \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi300x_docker.sh" diff --git a/runners/launch_mi300x-oci.sh b/runners/launch_mi300x-oci.sh index 33614a03c..f1123d722 100644 --- 
a/runners/launch_mi300x-oci.sh +++ b/runners/launch_mi300x-oci.sh @@ -6,6 +6,16 @@ PORT=8888 server_name="bmk-server" set -x +## Propagate GitHub summary file into the container when available +GH_SUM_ENV="" +GH_SUM_MOUNT="" +if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then + GH_SUM_ENV="-e GITHUB_STEP_SUMMARY=${GITHUB_STEP_SUMMARY}" + GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" + if [ -d "${GH_SUM_DIR}" ]; then + GH_SUM_MOUNT="-v ${GH_SUM_DIR}:${GH_SUM_DIR}" + fi +fi docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ @@ -13,6 +23,7 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ -e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \ + ${GH_SUM_ENV} ${GH_SUM_MOUNT} \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi300x_docker.sh" From 679caa66a213db5de9658dd17d04bc8298517f36 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 18:11:05 -0600 Subject: [PATCH 066/214] evals b200-nvd --- .github/workflows/eval-gms8k.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 35bd4751a..3fc810a48 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -43,8 +43,8 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi325x-tw_1 - image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} + runner: b200-nvd_0 + image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm precision: fp4 From eda5e2f46186553bf06c43aafabb763777a470f0 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 22:55:17 -0600 Subject: [PATCH 067/214] evals b200-nvd 2 --- .github/workflows/eval-gms8k.yml | 1 + benchmarks/gptoss_fp4_b200_docker.sh | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 3fc810a48..af33fddbe 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -56,3 +56,4 @@ jobs: eval-task: gsm8k num-fewshot: ${{ inputs.num_fewshot || '5' }} limit: ${{ inputs.limit || '200' }} + diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh index 1736701c4..8b5b6c881 100644 --- a/benchmarks/gptoss_fp4_b200_docker.sh +++ b/benchmarks/gptoss_fp4_b200_docker.sh @@ -71,4 +71,8 @@ run_benchmark_serving \ --num-prompts "$NUM_PROMPTS" \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ No newline at end of file + --result-dir /workspace/ + +# After throughput, run evaluation (defaults to GSM8K) +run_lm_eval --port "$PORT" +append_lm_eval_summary \ No newline at end of file From 42151cc85070b6a268f6848ca4a3e03a88428aff Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 22:59:45 -0600 Subject: [PATCH 068/214] evals b200-nvd 3 --- .github/workflows/eval-gms8k.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index af33fddbe..a5265a57e 100644 --- 
a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -43,7 +43,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: b200-nvd_0 + runner: b200-nvd_3 image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm From 512dfc065ee1ee02a73177648a7e35cc222b9350 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 23:11:30 -0600 Subject: [PATCH 069/214] evals h100-cr --- .github/workflows/eval-gms8k.yml | 2 +- benchmarks/gptoss_fp4_h100_docker.sh | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index a5265a57e..bef7a2d08 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -43,7 +43,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: b200-nvd_3 + runner: h100-cr_0 image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index e8f7bdd1d..212059b04 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -57,3 +57,7 @@ run_benchmark_serving \ --max-concurrency 512 \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation (defaults to GSM8K) +run_lm_eval --port "$PORT" +append_lm_eval_summary \ No newline at end of file From 4de631dc5e74d596afc155f008ec65057fbeb6fe Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 23:15:36 -0600 Subject: [PATCH 070/214] evals b200-nvd 1 --- .github/workflows/eval-gms8k.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index bef7a2d08..aa442cdd2 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -43,7 +43,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: h100-cr_0 + runner: b200-nvd_1 image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm From b33cb80edc8f1d13975b0b761aa7e67223f5941e Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 23:20:48 -0600 Subject: [PATCH 071/214] evals h200-trt-cw --- .github/workflows/eval-gms8k.yml | 4 ++-- benchmarks/dsr1_fp8_h200_trt_slurm.sh | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index aa442cdd2..e44ed3854 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -43,10 +43,10 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: b200-nvd_1 + runner: h200-cw_0 image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} - framework: vllm + framework: trt precision: fp4 exp-name: gptoss_gsm8k_poc tp: ${{ inputs.tp || '4' }} diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index 4ece6f7bc..c829e66b5 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -89,3 +89,7 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation (defaults to GSM8K) +run_lm_eval --port "$PORT" 
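+# append_lm_eval_summary prefers $GITHUB_STEP_SUMMARY and falls back to ${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md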
+append_lm_eval_summary From 5babdb0811e148b86b0d36c2b557b48a75974bd8 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 23:30:17 -0600 Subject: [PATCH 072/214] evals h200-trt-cw 2 --- .github/workflows/eval-gms8k.yml | 2 +- benchmarks/gptoss_fp4_h200_trt_slurm.sh | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index e44ed3854..8446adaf8 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -44,7 +44,7 @@ jobs: secrets: inherit with: runner: h200-cw_0 - image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} + image: ${{ inputs.image || 'nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-devs' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: trt precision: fp4 diff --git a/benchmarks/gptoss_fp4_h200_trt_slurm.sh b/benchmarks/gptoss_fp4_h200_trt_slurm.sh index 21d6ae02c..58768e3a2 100644 --- a/benchmarks/gptoss_fp4_h200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_trt_slurm.sh @@ -78,3 +78,7 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation (defaults to GSM8K) +run_lm_eval --port "$PORT" +append_lm_eval_summary \ No newline at end of file From 12a85b839c1dafc5087b7dfe8bd35d635c0aaa27 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 23:37:39 -0600 Subject: [PATCH 073/214] evals h200-trt-cw 3 --- .github/workflows/eval-gms8k.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 8446adaf8..3559abf31 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -44,7 +44,7 @@ jobs: secrets: inherit with: runner: h200-cw_0 - image: ${{ inputs.image || 'nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-devs' }} + image: ${{ inputs.image || 'nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: trt precision: fp4 From eb2846facefb4ac9dc7f8da78e57086e7dc7dc7e Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 23:38:59 -0600 Subject: [PATCH 074/214] evals h100-cr 2 --- .github/workflows/eval-gms8k.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 3559abf31..bef7a2d08 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -43,10 +43,10 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: h200-cw_0 - image: ${{ inputs.image || 'nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev' }} + runner: h100-cr_0 + image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} - framework: trt + framework: vllm precision: fp4 exp-name: gptoss_gsm8k_poc tp: ${{ inputs.tp || '4' }} From 41660708ef15e896ff315fa1527707ff9709a55e Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 23:45:07 -0600 Subject: [PATCH 075/214] evals h200-trt-cw 4 --- .github/workflows/eval-gms8k.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index bef7a2d08..2902d6ce0 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -17,7 +17,7 @@ on: description: "Tensor Parallel Size" required: false type: string - default: "4" + default: "1" port: 
description: "Server port" required: false @@ -43,10 +43,10 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: h100-cr_0 - image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} + runner: h200-cw_0 + image: ${{ inputs.image || 'nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} - framework: vllm + framework: trt precision: fp4 exp-name: gptoss_gsm8k_poc tp: ${{ inputs.tp || '4' }} From 5f6b772d660c9d2d910a6070599809e912777942 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 23:48:41 -0600 Subject: [PATCH 076/214] evals h200-trt-cw 5 (EP/TP HARD) --- .github/workflows/eval-gms8k.yml | 4 ++-- benchmarks/gptoss_fp4_h200_trt_slurm.sh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 2902d6ce0..b0e90175a 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -1,4 +1,4 @@ -name: Eval - GSM8K on H100 (PoC) +name: Eval - GSM8K (PoC) on: workflow_dispatch: @@ -49,7 +49,7 @@ jobs: framework: trt precision: fp4 exp-name: gptoss_gsm8k_poc - tp: ${{ inputs.tp || '4' }} + tp: ${{ inputs.tp || '1' }} ep: '1' dp-attn: false port: ${{ inputs.port || '8888' }} diff --git a/benchmarks/gptoss_fp4_h200_trt_slurm.sh b/benchmarks/gptoss_fp4_h200_trt_slurm.sh index 58768e3a2..3c959a7b1 100644 --- a/benchmarks/gptoss_fp4_h200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_trt_slurm.sh @@ -50,12 +50,12 @@ trtllm-serve $MODEL \ --max_num_tokens 20000 \ --backend pytorch \ --extra_llm_api_options gptoss-config.yml \ ---ep_size=$EP_SIZE \ +--ep_size=1 \ --trust_remote_code \ --gpus_per_node 8 \ --host 0.0.0.0 \ --port $PORT \ ---tp_size=$TP \ +--tp_size=1 \ --pp_size=1 \ > $SERVER_LOG 2>&1 & From 30baa1f0b52da8c95979cefd5443fa42a1a9bd26 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 16 Nov 2025 00:19:25 -0600 Subject: [PATCH 077/214] evals h200-trt-cw 6 (EP/TP HARD) --- benchmarks/benchmark_lib.sh | 96 ++++++++++++++++++++++++++++++++++++- 1 file changed, 95 insertions(+), 1 deletion(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 52e6cdbf8..e1115a602 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -234,7 +234,101 @@ _patch_lm_eval_filters() { patch_dir="$(mktemp -d)" cat > "$patch_dir/sitecustomize.py" <<'PY' # sitecustomize.py — loaded automatically by Python if on PYTHONPATH -import re, sys, unicodedata, types +import os, re, sys, unicodedata, types + +# -------------------------------------------------------- +# Transport-level shim: normalize chat completion requests +# -------------------------------------------------------- +# Some lm-eval builds may emit Responses-style message shapes +# (message.type, role "developer", structured content lists). +# Many OpenAI-compatible servers for /v1/chat/completions expect +# classic roles (system/user/assistant) and string content. +# +# This shim rewrites payloads sent to */v1/chat/completions into +# the classic format. It is no-op for other endpoints. 
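+#
+# Illustrative rewrite (assumed shapes, for reference only):
+#   in:  {"role": "developer", "type": "message",
+#         "content": [{"type": "input_text", "text": "2+2?"}]}
+#   out: {"role": "system", "content": "2+2?"}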
+ +def _flatten_content_to_text(content): + if content is None: + return "" + if isinstance(content, str): + return content + if isinstance(content, list): + parts = [] + for p in content: + if not isinstance(p, dict): + continue + t = p.get("type") or p.get("role") + if t in ("text", "input_text", None): + txt = p.get("text") + if txt is None: + txt = p.get("content") + if txt is None and isinstance(p.get("text"), dict): + txt = p["text"].get("content") + if txt: + parts.append(str(txt)) + return "".join(parts) + try: + return str(content) + except Exception: + return "" + +def _normalize_messages(payload): + try: + msgs = payload.get("messages") + if not isinstance(msgs, list): + return payload + norm = [] + for m in msgs: + if not isinstance(m, dict): + continue + role = m.get("role", "user") + if role == "developer": + role = "system" + m = {k: v for k, v in m.items() if k != "type"} + content = m.get("content") + if content is None: + content = m.get("text") if isinstance(m.get("text"), (str, list, dict)) else m.get("input") + m_out = {"role": role, "content": _flatten_content_to_text(content)} + if isinstance(m.get("name"), str): + m_out["name"] = m["name"] + norm.append(m_out) + payload["messages"] = norm + except Exception: + return payload + return payload + +def _patch_http_clients(): + # requests + try: + import requests + _orig_req = requests.sessions.Session.request + def _wrapped_request(self, method, url, *args, **kwargs): + if isinstance(kwargs.get("json"), dict) and "/chat/completions" in str(url): + kwargs["json"] = _normalize_messages(dict(kwargs["json"])) + return _orig_req(self, method, url, *args, **kwargs) + requests.sessions.Session.request = _wrapped_request + except Exception: + pass + # httpx sync/async + try: + import httpx + _orig_httpx = httpx.Client.request + def _wrapped_httpx(self, method, url, *args, **kwargs): + if isinstance(kwargs.get("json"), dict) and "/chat/completions" in str(url): + kwargs["json"] = _normalize_messages(dict(kwargs["json"])) + return _orig_httpx(self, method, url, *args, **kwargs) + httpx.Client.request = _wrapped_httpx + _orig_async = httpx.AsyncClient.request + async def _wrapped_async(self, method, url, *args, **kwargs): + if isinstance(kwargs.get("json"), dict) and "/chat/completions" in str(url): + kwargs["json"] = _normalize_messages(dict(kwargs["json"])) + return await _orig_async(self, method, url, *args, **kwargs) + httpx.AsyncClient.request = _wrapped_async + except Exception: + pass + +if not os.environ.get("LM_EVAL_DISABLE_CHAT_SHIM"): + _patch_http_clients() # ----------------------------- # 1) Safe regex filters (yours) From 5a209fdd5453a52a8642115816e3cee0175b8eb5 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 16 Nov 2025 00:20:03 -0600 Subject: [PATCH 078/214] evals h200-trt-cw 6 (EP/TP HARD) --- .github/workflows/eval-gms8k.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index b0e90175a..a41894f12 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -32,7 +32,7 @@ on: description: "Sample limit for GSM8K" required: false type: string - default: "200" + default: "1300" push: paths: - '.github/workflows/eval-gms8k.yml' From 89a9cbddb089e1e8a2ed756b4d948db5f5dcf21b Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 16 Nov 2025 00:37:07 -0600 Subject: [PATCH 079/214] evals h200-cw dsr1 --- .github/workflows/eval-gms8k.yml | 14 +- benchmarks/benchmark_lib.sh | 206 
++---------------------------- benchmarks/dsr1_fp8_h200_slurm.sh | 4 + 3 files changed, 22 insertions(+), 202 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index a41894f12..47699b3a7 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -17,7 +17,7 @@ on: description: "Tensor Parallel Size" required: false type: string - default: "1" + default: "8" port: description: "Server port" required: false @@ -44,12 +44,12 @@ jobs: secrets: inherit with: runner: h200-cw_0 - image: ${{ inputs.image || 'nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev' }} - model: ${{ inputs.model || 'openai/gpt-oss-120b' }} - framework: trt - precision: fp4 - exp-name: gptoss_gsm8k_poc - tp: ${{ inputs.tp || '1' }} + image: ${{ inputs.image || 'lmsysorg/sglang:v0.5.5-cu129-amd64' }} + model: ${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }} + framework: sglang + precision: fp8 + exp-name: dsr1_gsm8k_poc + tp: '8' ep: '1' dp-attn: false port: ${{ inputs.port || '8888' }} diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index e1115a602..8f38d9e9c 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -233,116 +233,17 @@ _patch_lm_eval_filters() { local patch_dir patch_dir="$(mktemp -d)" cat > "$patch_dir/sitecustomize.py" <<'PY' -# sitecustomize.py — loaded automatically by Python if on PYTHONPATH -import os, re, sys, unicodedata, types - -# -------------------------------------------------------- -# Transport-level shim: normalize chat completion requests -# -------------------------------------------------------- -# Some lm-eval builds may emit Responses-style message shapes -# (message.type, role "developer", structured content lists). -# Many OpenAI-compatible servers for /v1/chat/completions expect -# classic roles (system/user/assistant) and string content. -# -# This shim rewrites payloads sent to */v1/chat/completions into -# the classic format. It is no-op for other endpoints. 
- -def _flatten_content_to_text(content): - if content is None: - return "" - if isinstance(content, str): - return content - if isinstance(content, list): - parts = [] - for p in content: - if not isinstance(p, dict): - continue - t = p.get("type") or p.get("role") - if t in ("text", "input_text", None): - txt = p.get("text") - if txt is None: - txt = p.get("content") - if txt is None and isinstance(p.get("text"), dict): - txt = p["text"].get("content") - if txt: - parts.append(str(txt)) - return "".join(parts) - try: - return str(content) - except Exception: - return "" - -def _normalize_messages(payload): - try: - msgs = payload.get("messages") - if not isinstance(msgs, list): - return payload - norm = [] - for m in msgs: - if not isinstance(m, dict): - continue - role = m.get("role", "user") - if role == "developer": - role = "system" - m = {k: v for k, v in m.items() if k != "type"} - content = m.get("content") - if content is None: - content = m.get("text") if isinstance(m.get("text"), (str, list, dict)) else m.get("input") - m_out = {"role": role, "content": _flatten_content_to_text(content)} - if isinstance(m.get("name"), str): - m_out["name"] = m["name"] - norm.append(m_out) - payload["messages"] = norm - except Exception: - return payload - return payload - -def _patch_http_clients(): - # requests - try: - import requests - _orig_req = requests.sessions.Session.request - def _wrapped_request(self, method, url, *args, **kwargs): - if isinstance(kwargs.get("json"), dict) and "/chat/completions" in str(url): - kwargs["json"] = _normalize_messages(dict(kwargs["json"])) - return _orig_req(self, method, url, *args, **kwargs) - requests.sessions.Session.request = _wrapped_request - except Exception: - pass - # httpx sync/async - try: - import httpx - _orig_httpx = httpx.Client.request - def _wrapped_httpx(self, method, url, *args, **kwargs): - if isinstance(kwargs.get("json"), dict) and "/chat/completions" in str(url): - kwargs["json"] = _normalize_messages(dict(kwargs["json"])) - return _orig_httpx(self, method, url, *args, **kwargs) - httpx.Client.request = _wrapped_httpx - _orig_async = httpx.AsyncClient.request - async def _wrapped_async(self, method, url, *args, **kwargs): - if isinstance(kwargs.get("json"), dict) and "/chat/completions" in str(url): - kwargs["json"] = _normalize_messages(dict(kwargs["json"])) - return await _orig_async(self, method, url, *args, **kwargs) - httpx.AsyncClient.request = _wrapped_async - except Exception: - pass - -if not os.environ.get("LM_EVAL_DISABLE_CHAT_SHIM"): - _patch_http_clients() - -# ----------------------------- -# 1) Safe regex filters (yours) -# ----------------------------- +import re, sys, unicodedata from lm_eval.filters import extraction as ex def _s(x): # coerce to str return x if isinstance(x, str) else "" -# --- RegexFilter.apply --- +# --- Patch RegexFilter.apply (used by many datasets) --- _orig_regex_apply = ex.RegexFilter.apply def _safe_regex_apply(self, resps, docs): out = [] - for inst in resps: # list of candidates for one doc + for inst in resps: # inst is a list of candidate responses for one doc filtered = [] for resp in inst: txt = _s(resp) @@ -360,7 +261,7 @@ def _safe_regex_apply(self, resps, docs): return out ex.RegexFilter.apply = _safe_regex_apply -# --- MultiChoiceRegexFilter.apply (used by GSM8K flexible-extract) --- +# --- Patch MultiChoiceRegexFilter.apply (used by GSM8K flexible-extract) --- _orig_mc_apply = ex.MultiChoiceRegexFilter.apply def _safe_mc_apply(self, resps, docs): def find_match(regex, resp, 
convert_dict={}): @@ -397,7 +298,7 @@ def _safe_mc_apply(self, resps, docs): out = [] for r, doc in zip(resps, docs): - # Build fallback regexes from choices (A, B, C, ...) as upstream + # Build fallback regexes from choices (A, B, C, ...) as in upstream fallback_regexes, choice_to_alpha = [], {} next_alpha = "A" without_paren, without_paren_to_target = [], {} @@ -424,75 +325,8 @@ def _safe_mc_apply(self, resps, docs): filtered.append(m) out.append(filtered) return out -ex.MultiChoiceRegexFilter.apply = _safe_mc_apply - -# ----------------------------------------------------- -# 2) Fallback to reasoning_content in parse_generations -# ----------------------------------------------------- -# For OpenAI-like chat completions, some servers return: -# choices[0].message.content == None -# choices[0].message.reasoning_content == "" -# If so, return reasoning_content instead of None; if both missing, return "". - -from lm_eval.models.api_models import TemplateAPI - -def _wrap_parse_generations_on_class(cls): - if not hasattr(cls, "parse_generations"): - return - orig = cls.parse_generations - # parse_generations is a @staticmethod on API models; preserve staticmethod - def wrapped(*, outputs, **kwargs): - # First, run the original - res = orig(outputs=outputs, **kwargs) - # Normalize to list for convenience - if isinstance(res, (str, type(None))): - res = [res] - outputs_list = [outputs] - else: - outputs_list = outputs if isinstance(outputs, list) else [outputs] - - def _fallback_from_output(o): - try: - # OpenAI-style: dict -> choices[0] -> message - ch0 = (o or {}).get("choices", [{}])[0] - msg = ch0.get("message", {}) or {} - txt = msg.get("content") - if txt is None: - # Newer servers may use reasoning_content - txt = msg.get("reasoning_content") - if txt is None: - # Some servers put it at choices[0].reasoning.content - txt = (ch0.get("reasoning") or {}).get("content") - return "" if txt is None else txt - except Exception: - return "" - fb = [_fallback_from_output(o) for o in outputs_list] - # Replace None/empty only if a fallback exists - res_out = [] - for i, v in enumerate(res): - if (v is None or v == "") and i < len(fb) and fb[i]: - res_out.append(fb[i]) - else: - # still coerce None -> "" so downstream filters never see None - res_out.append("" if v is None else v) - return res_out - - # Rebind as staticmethod to match original decoration - cls.parse_generations = staticmethod(wrapped) - -# Try to patch common OpenAI-like chat backends -try: - from lm_eval.models import openai_like as oli - for name in dir(oli): - obj = getattr(oli, name) - if isinstance(obj, type) and issubclass(obj, TemplateAPI): - # Heuristically target chat-style classes only - if "Chat" in obj.__name__ or "OpenAI" in obj.__name__: - _wrap_parse_generations_on_class(obj) -except Exception: - # If module layout changes, fail soft; your regex guards still protect filters. 
- pass +ex.MultiChoiceRegexFilter.apply = _safe_mc_apply PY export PYTHONPATH="${patch_dir}:${PYTHONPATH:-}" } @@ -565,34 +399,16 @@ append_lm_eval_summary() { set +x local results_dir="${EVAL_RESULT_DIR:-eval_out}" local task="${EVAL_TASK:-gsm8k}" - # Render markdown once, then decide where to write it to avoid redirection errors - local md_out - md_out=$(python3 utils/lm_eval_to_md.py \ + if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then + _ensure_bench_serving_repo + python3 XXX \ --results-dir "/workspace/${results_dir}" \ --task "${task}" \ --framework "${FRAMEWORK}" \ --precision "${PRECISION}" \ --tp "${TP:-1}" \ --ep "${EP_SIZE:-1}" \ - --dp-attention "${DP_ATTENTION:-false}" 2>/dev/null || true) - - # If nothing was produced, nothing to append - if [ -z "${md_out}" ]; then - return 0 + --dp-attention "${DP_ATTENTION:-false}" \ + >> "$GITHUB_STEP_SUMMARY" || true fi - - # Prefer GitHub step summary when available and path is valid; otherwise fallback to workspace file - if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then - local _gh_path="$GITHUB_STEP_SUMMARY" - local _gh_dir - _gh_dir="$(dirname "$_gh_path")" - if [ -d "$_gh_dir" ]; then - printf "%s\n" "${md_out}" >> "$_gh_path" || true - return 0 - fi - fi - - # Fallback: write to a summary file alongside results - mkdir -p "/workspace/${results_dir}" 2>/dev/null || true - printf "%s\n" "${md_out}" >> "/workspace/${results_dir}/SUMMARY.md" || true } diff --git a/benchmarks/dsr1_fp8_h200_slurm.sh b/benchmarks/dsr1_fp8_h200_slurm.sh index 06345ecb2..8e8ec7469 100644 --- a/benchmarks/dsr1_fp8_h200_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_slurm.sh @@ -63,3 +63,7 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation (defaults to GSM8K) +run_lm_eval --port "$PORT" +append_lm_eval_summary \ No newline at end of file From 9254ef19d5c9bb26a535240df0a2c9192e07640e Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 16 Nov 2025 00:56:20 -0600 Subject: [PATCH 080/214] evals mi300x-cr dsr1 --- .github/workflows/eval-gms8k.yml | 4 ++-- benchmarks/benchmark_lib.sh | 4 ++-- benchmarks/dsr1_fp8_mi300x_docker.sh | 4 ++++ 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 47699b3a7..1689cc9c1 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -43,8 +43,8 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: h200-cw_0 - image: ${{ inputs.image || 'lmsysorg/sglang:v0.5.5-cu129-amd64' }} + runner: mi300x-cr_0 + image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' }} model: ${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }} framework: sglang precision: fp8 diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 8f38d9e9c..861d0f483 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -347,8 +347,8 @@ run_lm_eval() { local task="${EVAL_TASK:-gsm8k}" local num_fewshot="${NUM_FEWSHOT:-5}" local results_dir="${EVAL_RESULT_DIR:-eval_out}" - local batch_size=2 - local gen_max_tokens=8192 + local batch_size=3 + local gen_max_tokens=4096 local temperature=0 local top_p=1 diff --git a/benchmarks/dsr1_fp8_mi300x_docker.sh b/benchmarks/dsr1_fp8_mi300x_docker.sh index 8c269dd83..3e604f3ca 100644 --- a/benchmarks/dsr1_fp8_mi300x_docker.sh +++ b/benchmarks/dsr1_fp8_mi300x_docker.sh @@ -56,3 +56,7 @@ run_benchmark_serving \ --max-concurrency 
"$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation (defaults to GSM8K) +run_lm_eval --port "$PORT" +append_lm_eval_summary \ No newline at end of file From 6705ea3a4b8bc68caa2654cffd1ef7cdfc271360 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 16 Nov 2025 00:58:07 -0600 Subject: [PATCH 081/214] evals mi300x-cr dsr1 2 --- .github/workflows/eval-tmpl.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index c2363540a..274ddcf92 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -101,6 +101,7 @@ jobs: docker ps -aq fi fi + sudo rm -rf /home/ubuntu/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out/openai__gpt-oss-120b if command -v squeue >/dev/null 2>&1; then echo "[Slurm] Cleaning up resources ..." scancel -u $USER || true From c1fc6db41958493ca82ac9fbc3b5ac8f402bf4ce Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 16 Nov 2025 01:14:08 -0600 Subject: [PATCH 082/214] evals mi325x-cr dsr1 --- .github/workflows/eval-gms8k.yml | 2 +- benchmarks/benchmark_lib.sh | 3 +-- benchmarks/dsr1_fp8_mi325x_slurm.sh | 4 ++++ 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 1689cc9c1..e7bb716ca 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -43,7 +43,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi300x-cr_0 + runner: mi325x-amd_0 image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' }} model: ${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }} framework: sglang diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 861d0f483..224b4dfd5 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -400,8 +400,7 @@ append_lm_eval_summary() { local results_dir="${EVAL_RESULT_DIR:-eval_out}" local task="${EVAL_TASK:-gsm8k}" if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then - _ensure_bench_serving_repo - python3 XXX \ + python3 utils/lm_eval_to_md.py \ --results-dir "/workspace/${results_dir}" \ --task "${task}" \ --framework "${FRAMEWORK}" \ diff --git a/benchmarks/dsr1_fp8_mi325x_slurm.sh b/benchmarks/dsr1_fp8_mi325x_slurm.sh index 67e4cc394..4e66a64fb 100644 --- a/benchmarks/dsr1_fp8_mi325x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi325x_slurm.sh @@ -42,3 +42,7 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation (defaults to GSM8K) +run_lm_eval --port "$PORT" +append_lm_eval_summary \ No newline at end of file From 090630a1555b6dd6b42102cf3be488b34b3de700 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 16 Nov 2025 01:28:05 -0600 Subject: [PATCH 083/214] evals mi325x-cr dsr1 2 --- .github/workflows/eval-gms8k.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index e7bb716ca..2c1347cb9 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -43,7 +43,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi325x-amd_0 + runner: mi325x-tw_0 image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' }} model: ${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }} framework: sglang From 
d984d7a06ec27111bcfd48d4d0233c067ef2ec5a Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 16 Nov 2025 01:31:39 -0600 Subject: [PATCH 084/214] evals mi355x-amd dsr1 --- .github/workflows/eval-gms8k.yml | 2 +- .github/workflows/eval-tmpl.yml | 2 +- benchmarks/dsr1_fp8_mi355x_docker.sh | 6 ++++-- benchmarks/dsr1_fp8_mi355x_slurm.sh | 5 +++++ 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 2c1347cb9..50c2390a9 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -43,7 +43,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi325x-tw_0 + runner: mi355x-amd_4 image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' }} model: ${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }} framework: sglang diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 274ddcf92..433ec1a68 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -65,7 +65,7 @@ env: LIMIT: ${{ inputs.limit }} EVAL_RESULT_DIR: eval_out # Server-side concurrency default (used by some server scripts) - CONC: '16' + CONC: '8' MAX_MODEL_LEN: '8192' ISL: 1024 OSL: 1024 diff --git a/benchmarks/dsr1_fp8_mi355x_docker.sh b/benchmarks/dsr1_fp8_mi355x_docker.sh index d4f1dd013..17e51344a 100644 --- a/benchmarks/dsr1_fp8_mi355x_docker.sh +++ b/benchmarks/dsr1_fp8_mi355x_docker.sh @@ -58,5 +58,7 @@ run_benchmark_serving \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ - - +# After throughput, run evaluation (defaults to GSM8K) +run_lm_eval --port "$PORT" +append_lm_eval_summary +set +x \ No newline at end of file diff --git a/benchmarks/dsr1_fp8_mi355x_slurm.sh b/benchmarks/dsr1_fp8_mi355x_slurm.sh index fd6fe49fb..b16c8e247 100644 --- a/benchmarks/dsr1_fp8_mi355x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi355x_slurm.sh @@ -51,3 +51,8 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation (defaults to GSM8K) +run_lm_eval --port "$PORT" +append_lm_eval_summary +set +x \ No newline at end of file From fb66e33543ffeb769422cae7ce9a886531f8b288 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 16 Nov 2025 07:48:59 -0600 Subject: [PATCH 085/214] evals mi355x-amd dsr1 2 --- .github/workflows/eval-tmpl.yml | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 433ec1a68..9354c4f67 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -80,16 +80,20 @@ jobs: steps: - name: Resource cleanup run: | - if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then + # Helper to avoid indefinite hangs on flaky Docker daemons + safe_timeout() { timeout -k 5 30s "$@"; } + + if command -v docker >/dev/null 2>&1 && safe_timeout docker info >/dev/null 2>&1; then host=$(hostname) if [[ "$host" == "b200-81" || "$host" == "b200-80" || "$host" == "b200-79" ]]; then echo "[INFO] Running container-by-container cleanup on $host" - for cid in $(docker ps -aq); do + cids=$(safe_timeout docker ps -aq || true) + for cid in $cids; do echo "[INFO] Cleaning container $cid" - docker stop -t 90 "$cid" || true - docker wait "$cid" >/dev/null 2>&1 || true - docker rm -f "$cid" >/dev/null 2>&1 || true + safe_timeout docker stop -t 90 "$cid" || true + safe_timeout docker 
wait "$cid" >/dev/null 2>&1 || true + safe_timeout docker rm -f "$cid" >/dev/null 2>&1 || true done sleep 2 if nvidia-smi --query-compute-apps=pid --format=csv,noheader | grep -q '[0-9]'; then @@ -98,7 +102,7 @@ jobs: fi else echo "[Docker] looking at docker resources ..." - docker ps -aq + safe_timeout docker ps -aq || echo "[WARN] docker ps timed out" fi fi sudo rm -rf /home/ubuntu/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out/openai__gpt-oss-120b From d0eb0c4e2ab4369be6c98d361c31512a95ce86f3 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 16 Nov 2025 08:02:14 -0600 Subject: [PATCH 086/214] evals mi355x-amd dsr1 3 --- .github/workflows/eval-tmpl.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 9354c4f67..c98f4be0d 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -83,10 +83,9 @@ jobs: # Helper to avoid indefinite hangs on flaky Docker daemons safe_timeout() { timeout -k 5 30s "$@"; } - if command -v docker >/dev/null 2>&1 && safe_timeout docker info >/dev/null 2>&1; then - host=$(hostname) - - if [[ "$host" == "b200-81" || "$host" == "b200-80" || "$host" == "b200-79" ]]; then + host=$(hostname) + if [[ "$host" == "b200-81" || "$host" == "b200-80" || "$host" == "b200-79" ]]; then + if command -v docker >/dev/null 2>&1; then echo "[INFO] Running container-by-container cleanup on $host" cids=$(safe_timeout docker ps -aq || true) for cid in $cids; do @@ -101,9 +100,10 @@ jobs: nvidia-smi || true fi else - echo "[Docker] looking at docker resources ..." - safe_timeout docker ps -aq || echo "[WARN] docker ps timed out" + echo "[Docker] docker client not found; skipping cleanup" fi + else + echo "[Docker] skipping docker cleanup on host $host" fi sudo rm -rf /home/ubuntu/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out/openai__gpt-oss-120b if command -v squeue >/dev/null 2>&1; then From c1dc1a6b3e66c5042a39c083825bdb03d41e76a3 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 16 Nov 2025 08:07:21 -0600 Subject: [PATCH 087/214] evals mi355x-amd dsr1 4 --- .github/workflows/eval-gms8k.yml | 6 +++--- .github/workflows/eval-tmpl.yml | 31 +++++++++++++++++++++++++------ 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 50c2390a9..860ee5987 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -43,11 +43,11 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi355x-amd_4 + runner: mi355x-amd_5 image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' }} - model: ${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }} + model: ${{ inputs.model || 'amd/DeepSeek-R1-0528-MXFP4-Preview' }} framework: sglang - precision: fp8 + precision: fp4 exp-name: dsr1_gsm8k_poc tp: '8' ep: '1' diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index c98f4be0d..2d2e8404a 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -80,8 +80,14 @@ jobs: steps: - name: Resource cleanup run: | - # Helper to avoid indefinite hangs on flaky Docker daemons - safe_timeout() { timeout -k 5 30s "$@"; } + # Helper to avoid indefinite hangs on flaky tools (Docker/Slurm) + safe_timeout() { + if command -v timeout >/dev/null 2>&1; then + timeout -k 5 30s "$@" + else + "$@" + fi + } host=$(hostname) if [[ "$host" == "b200-81" || 
"$host" == "b200-80" || "$host" == "b200-79" ]]; then @@ -105,14 +111,27 @@ jobs: else echo "[Docker] skipping docker cleanup on host $host" fi - sudo rm -rf /home/ubuntu/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out/openai__gpt-oss-120b + # Best-effort cleanup of prior eval outputs; do not block + safe_timeout sudo rm -rf /home/ubuntu/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out/openai__gpt-oss-120b || true + if command -v squeue >/dev/null 2>&1; then echo "[Slurm] Cleaning up resources ..." - scancel -u $USER || true - while [ -n "$(squeue -u $USER --noheader --format='%i')" ]; do - squeue -u $USER || true + safe_timeout scancel -u "$USER" || true + # Wait up to 5 minutes for jobs to clear to avoid indefinite hang + end=$((SECONDS + 300)) + while [ $SECONDS -lt $end ]; do + queued=$(safe_timeout squeue -u "$USER" --noheader --format='%i' 2>/dev/null || true) + if [ -z "$queued" ]; then + break + fi + echo "$queued" | sed 's/^/[Slurm] pending job: /' || true sleep 5 done + # Final status; do not block + safe_timeout squeue -u "$USER" || true + if [ -n "$(safe_timeout squeue -u "$USER" --noheader --format='%i' 2>/dev/null || true)" ]; then + echo "[Slurm] Jobs still present after timeout; proceeding" + fi fi - uses: actions/checkout@v5 From 88d3bf5730d0c71d83dafcbfdbb082416de0a887 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 16 Nov 2025 08:15:45 -0600 Subject: [PATCH 088/214] evals b200-nvd dsr1 --- .github/workflows/eval-gms8k.yml | 6 +++--- benchmarks/dsr1_fp4_b200_docker.sh | 3 +++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 860ee5987..52bdde77e 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -43,9 +43,9 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi355x-amd_5 - image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' }} - model: ${{ inputs.model || 'amd/DeepSeek-R1-0528-MXFP4-Preview' }} + runner: b200-nvd_2 + image: ${{ inputs.image || 'lmsysorg/sglang:v0.5.5-cu129-amd64' }} + model: ${{ inputs.model || 'nvidia/DeepSeek-R1-0528-FP4-V2' }} framework: sglang precision: fp4 exp-name: dsr1_gsm8k_poc diff --git a/benchmarks/dsr1_fp4_b200_docker.sh b/benchmarks/dsr1_fp4_b200_docker.sh index a520871fa..656085fef 100644 --- a/benchmarks/dsr1_fp4_b200_docker.sh +++ b/benchmarks/dsr1_fp4_b200_docker.sh @@ -48,3 +48,6 @@ run_benchmark_serving \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ +# After throughput, run evaluation (defaults to GSM8K) +run_lm_eval --port "$PORT" +append_lm_eval_summary \ No newline at end of file From 8a0677d29c6c2ffd0f5ee7ead02f5ed3b2c10dc5 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 16 Nov 2025 08:20:21 -0600 Subject: [PATCH 089/214] evals b200-nvd fp8 dsr1 --- .github/workflows/eval-gms8k.yml | 4 ++-- benchmarks/dsr1_fp8_b200_docker.sh | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 52bdde77e..eea4a707f 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -45,9 +45,9 @@ jobs: with: runner: b200-nvd_2 image: ${{ inputs.image || 'lmsysorg/sglang:v0.5.5-cu129-amd64' }} - model: ${{ inputs.model || 'nvidia/DeepSeek-R1-0528-FP4-V2' }} + model: ${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }} framework: sglang - precision: fp4 + precision: fp8 exp-name: dsr1_gsm8k_poc tp: 
'8' ep: '1' diff --git a/benchmarks/dsr1_fp8_b200_docker.sh b/benchmarks/dsr1_fp8_b200_docker.sh index ffa7644bd..e68397661 100644 --- a/benchmarks/dsr1_fp8_b200_docker.sh +++ b/benchmarks/dsr1_fp8_b200_docker.sh @@ -57,4 +57,8 @@ run_benchmark_serving \ --num-prompts "$NUM_PROMPTS" \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ No newline at end of file + --result-dir /workspace/ + +# After throughput, run evaluation (defaults to GSM8K) +run_lm_eval --port "$PORT" +append_lm_eval_summary \ No newline at end of file From f862af787c80e9725c40915bb8997292e2cf6146 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 20 Nov 2025 20:51:15 -0600 Subject: [PATCH 090/214] Lighteval 1 --- .github/workflows/eval-gms8k.yml | 10 +- benchmarks/benchmark_lib.sh | 486 ++++++++++++++++--------- benchmarks/gptoss_fp4_mi300x_docker.sh | 8 +- 3 files changed, 318 insertions(+), 186 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index eea4a707f..9ed618cda 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -43,11 +43,11 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: b200-nvd_2 - image: ${{ inputs.image || 'lmsysorg/sglang:v0.5.5-cu129-amd64' }} - model: ${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }} - framework: sglang - precision: fp8 + runner: mi300x-amd_0 + image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} + model: ${{ inputs.model || 'openai/gpt-oss-120b' }} + framework: vllm + precision: fp4 exp-name: dsr1_gsm8k_poc tp: '8' ep: '1' diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 224b4dfd5..bf96c8bda 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -1,12 +1,15 @@ #!/usr/bin/env bash -# Shared benchmarking utilities for InferenceMAX +# Shared benchmarking + evaluation utilities for InferenceMAX + +# --------------------------------- +# Server readiness / benchmarks +# --------------------------------- # Wait for server to be ready by polling the health endpoint -# All parameters are required # Parameters: -# --port: Server port -# --server-log: Path to server log file +# --port: Server port (required) +# --server-log: Path to server log file (required) # --server-pid: Server process ID (required) # --sleep-interval: Sleep interval between health checks (optional, default: 5) wait_for_server_ready() { @@ -16,73 +19,37 @@ wait_for_server_ready() { local server_pid="" local sleep_interval=5 - # Parse arguments while [[ $# -gt 0 ]]; do case $1 in - --port) - port="$2" - shift 2 - ;; - --server-log) - server_log="$2" - shift 2 - ;; - --server-pid) - server_pid="$2" - shift 2 - ;; - --sleep-interval) - sleep_interval="$2" - shift 2 - ;; - *) - echo "Unknown parameter: $1" - return 1 - ;; + --port) port="$2"; shift 2 ;; + --server-log) server_log="$2"; shift 2 ;; + --server-pid) server_pid="$2"; shift 2 ;; + --sleep-interval) sleep_interval="$2"; shift 2 ;; + *) echo "Unknown parameter: $1"; return 1 ;; esac done - # Validate required parameters - if [[ -z "$port" ]]; then - echo "Error: --port is required" - return 1 - fi - if [[ -z "$server_log" ]]; then - echo "Error: --server-log is required" - return 1 - fi - if [[ -z "$server_pid" ]]; then - echo "Error: --server-pid is required" - return 1 - fi + if [[ -z "$port" ]]; then echo "Error: --port is required"; return 1; fi + if [[ -z "$server_log" ]]; then echo "Error: --server-log is 
required"; return 1; fi + if [[ -z "$server_pid" ]]; then echo "Error: --server-pid is required"; return 1; fi # Show logs until server is ready tail -f "$server_log" & local TAIL_PID=$! - until curl --output /dev/null --silent --fail http://0.0.0.0:$port/health; do + + until curl --output /dev/null --silent --fail "http://0.0.0.0:$port/health"; do if ! kill -0 "$server_pid" 2>/dev/null; then echo "Server died before becoming healthy. Exiting." - kill $TAIL_PID + kill "$TAIL_PID" exit 1 fi sleep "$sleep_interval" done - kill $TAIL_PID + kill "$TAIL_PID" } # Run benchmark serving with standardized parameters -# All parameters are required -# Parameters: -# --model: Model name -# --port: Server port -# --backend: Backend type - 'vllm' or 'openai' -# --input-len: Random input sequence length -# --output-len: Random output sequence length -# --random-range-ratio: Random range ratio -# --num-prompts: Number of prompts -# --max-concurrency: Max concurrency -# --result-filename: Result filename without extension -# --result-dir: Result directory +# All parameters are required unless otherwise noted run_benchmark_serving() { set +x local model="" @@ -95,104 +62,43 @@ run_benchmark_serving() { local max_concurrency="" local result_filename="" local result_dir="" + local tokenizer="" - # Parse arguments while [[ $# -gt 0 ]]; do case $1 in - --model) - model="$2" - shift 2 - ;; - --port) - port="$2" - shift 2 - ;; - --backend) - backend="$2" - shift 2 - ;; - --input-len) - input_len="$2" - shift 2 - ;; - --output-len) - output_len="$2" - shift 2 - ;; - --random-range-ratio) - random_range_ratio="$2" - shift 2 - ;; - --num-prompts) - num_prompts="$2" - shift 2 - ;; - --max-concurrency) - max_concurrency="$2" - shift 2 - ;; - --result-filename) - result_filename="$2" - shift 2 - ;; - --result-dir) - result_dir="$2" - shift 2 - ;; - *) - echo "Unknown parameter: $1" - return 1 - ;; + --model) model="$2"; shift 2 ;; + --port) port="$2"; shift 2 ;; + --backend) backend="$2"; shift 2 ;; + --input-len) input_len="$2"; shift 2 ;; + --output-len) output_len="$2"; shift 2 ;; + --random-range-ratio) random_range_ratio="$2"; shift 2 ;; + --num-prompts) num_prompts="$2"; shift 2 ;; + --max-concurrency) max_concurrency="$2"; shift 2 ;; + --result-filename) result_filename="$2"; shift 2 ;; + --result-dir) result_dir="$2"; shift 2 ;; + --tokenizer) tokenizer="$2"; shift 2 ;; + *) echo "Unknown parameter: $1"; return 1 ;; esac done - # Validate all required parameters - if [[ -z "$model" ]]; then - echo "Error: --model is required" - return 1 - fi - if [[ -z "$port" ]]; then - echo "Error: --port is required" - return 1 - fi - if [[ -z "$backend" ]]; then - echo "Error: --backend is required" - return 1 - fi - if [[ -z "$input_len" ]]; then - echo "Error: --input-len is required" - return 1 - fi - if [[ -z "$output_len" ]]; then - echo "Error: --output-len is required" - return 1 - fi - if [[ -z "$random_range_ratio" ]]; then - echo "Error: --random-range-ratio is required" - return 1 - fi - if [[ -z "$num_prompts" ]]; then - echo "Error: --num-prompts is required" - return 1 - fi - if [[ -z "$max_concurrency" ]]; then - echo "Error: --max-concurrency is required" - return 1 - fi - if [[ -z "$result_filename" ]]; then - echo "Error: --result-filename is required" - return 1 - fi - if [[ -z "$result_dir" ]]; then - echo "Error: --result-dir is required" - return 1 - fi + # Validation + local vars=(model port backend input_len output_len random_range_ratio num_prompts max_concurrency result_filename result_dir) + for 
var in "${vars[@]}"; do + if [[ -z "${!var}" ]]; then + echo "Error: --${var//_/-} is required" + return 1 + fi + done - # Clone benchmark serving repo - local BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) + local BENCH_SERVING_DIR + BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) git clone https://github.com/kimbochen/bench_serving.git "$BENCH_SERVING_DIR" - # Run benchmark + local extra_tokenizer_args=() + if [[ -n "$tokenizer" ]]; then + extra_tokenizer_args=(--tokenizer "$tokenizer") + fi + set -x python3 "$BENCH_SERVING_DIR/benchmark_serving.py" \ --model "$model" \ @@ -204,6 +110,7 @@ run_benchmark_serving() { --random-range-ratio "$random_range_ratio" \ --num-prompts "$num_prompts" \ --max-concurrency "$max_concurrency" \ + "${extra_tokenizer_args[@]}" \ --request-rate inf \ --ignore-eos \ --save-result \ @@ -218,7 +125,6 @@ run_benchmark_serving() { # Eval (lm-eval-harness) helpers # ------------------------------ -# Install or update lm-eval dependencies _install_lm_eval_deps() { set +x python3 -m pip install -q --no-cache-dir "lm-eval[api]" || true @@ -239,11 +145,11 @@ from lm_eval.filters import extraction as ex def _s(x): # coerce to str return x if isinstance(x, str) else "" -# --- Patch RegexFilter.apply (used by many datasets) --- +# --- Patch RegexFilter.apply --- _orig_regex_apply = ex.RegexFilter.apply def _safe_regex_apply(self, resps, docs): out = [] - for inst in resps: # inst is a list of candidate responses for one doc + for inst in resps: filtered = [] for resp in inst: txt = _s(resp) @@ -261,7 +167,7 @@ def _safe_regex_apply(self, resps, docs): return out ex.RegexFilter.apply = _safe_regex_apply -# --- Patch MultiChoiceRegexFilter.apply (used by GSM8K flexible-extract) --- +# --- Patch MultiChoiceRegexFilter.apply --- _orig_mc_apply = ex.MultiChoiceRegexFilter.apply def _safe_mc_apply(self, resps, docs): def find_match(regex, resp, convert_dict={}): @@ -298,7 +204,6 @@ def _safe_mc_apply(self, resps, docs): out = [] for r, doc in zip(resps, docs): - # Build fallback regexes from choices (A, B, C, ...) 
as in upstream fallback_regexes, choice_to_alpha = [], {} next_alpha = "A" without_paren, without_paren_to_target = [], {} @@ -331,16 +236,6 @@ PY export PYTHONPATH="${patch_dir}:${PYTHONPATH:-}" } -# Run an lm-eval-harness task against a local OpenAI-compatible server -# Parameters: -# --port: Server port (default: $PORT or 8888) -# --task: Eval task (default: $EVAL_TASK or gsm8k) -# --num-fewshot: Fewshot k (default: $NUM_FEWSHOT or 5) -# --results-dir: Output dir (default: $EVAL_RESULT_DIR or eval_out) -# --batch-size: Harness batch size (default: 2) -# --gen-max-tokens: Max tokens for generation (default: 8192) -# --temperature: Temperature (default: 0) -# --top-p: Top-p (default: 1) run_lm_eval() { set +x local port="${PORT:-8888}" @@ -352,35 +247,25 @@ run_lm_eval() { local temperature=0 local top_p=1 - # Parse arguments while [[ $# -gt 0 ]]; do case $1 in - --port) - port="$2"; shift 2;; - --task) - task="$2"; shift 2;; - --num-fewshot) - num_fewshot="$2"; shift 2;; - --results-dir) - results_dir="$2"; shift 2;; - --batch-size) - batch_size="$2"; shift 2;; - --gen-max-tokens) - gen_max_tokens="$2"; shift 2;; - --temperature) - temperature="$2"; shift 2;; - --top-p) - top_p="$2"; shift 2;; - *) - echo "Unknown parameter: $1"; return 1;; + --port) port="$2"; shift 2 ;; + --task) task="$2"; shift 2 ;; + --num-fewshot) num_fewshot="$2"; shift 2 ;; + --results-dir) results_dir="$2"; shift 2 ;; + --batch-size) batch_size="$2"; shift 2 ;; + --gen-max-tokens) gen_max_tokens="$2"; shift 2 ;; + --temperature) temperature="$2"; shift 2 ;; + --top-p) top_p="$2"; shift 2 ;; + *) echo "Unknown parameter: $1"; return 1 ;; esac done - + _install_lm_eval_deps _patch_lm_eval_filters local openai_server_base="http://0.0.0.0:${port}" - local openai_chat_base="$openai_server_base/v1/chat/completions" + local openai_chat_base="${openai_server_base}/v1/chat/completions" export OPENAI_API_KEY=${OPENAI_API_KEY:-EMPTY} set -x @@ -394,7 +279,6 @@ run_lm_eval() { set +x } -# Append a Markdown summary to GitHub step summary (no-op if not in GH Actions) append_lm_eval_summary() { set +x local results_dir="${EVAL_RESULT_DIR:-eval_out}" @@ -411,3 +295,247 @@ append_lm_eval_summary() { >> "$GITHUB_STEP_SUMMARY" || true fi } + + +# ------------------------------ +# Lighteval + LiteLLM patching +# ------------------------------ + +_install_lighteval_deps() { + set +x + python3 -m pip install -q --no-cache-dir "lighteval[api]" "litellm" || true +} + +# Patch lighteval's LiteLLMClient to handle reasoning content and Python name mangling +_patch_lighteval_litellm() { + set +x + local patch_dir + patch_dir="$(mktemp -d)" + cat > "$patch_dir/sitecustomize.py" <<'PY' +import logging +import time + +import litellm +from tqdm import tqdm + +litellm.suppress_debug_info = True + +from lighteval.models.endpoints.litellm_model import LiteLLMClient +from lighteval.data import GenerativeTaskDataset +from lighteval.tasks.requests import Doc, SamplingMethod +from lighteval.models.model_output import ModelResponse +from lighteval.utils.cache_management import cached + +logger = logging.getLogger(__name__) + +# --- Patched __call_api: don't retry when we have reasoning_content --- +def _patched___call_api(self, prompt, return_logits, max_new_tokens, num_samples, stop_sequence): # noqa: C901 + """Make API call with retries, but don't treat reasoning-only responses as empty.""" + from lighteval.models.endpoints.litellm_model import LitellmModelResponse + + response = LitellmModelResponse() + stop_sequence = 
self._prepare_stop_sequence(stop_sequence) + max_new_tokens = self._prepare_max_new_tokens(max_new_tokens) + + if return_logits and not self.provider == "openai": + logger.warning("Returning logits is not supported for this provider, ignoring.") + + kwargs = { + "model": self.model, + "messages": prompt, + "response_format": {"type": "text"}, + "max_tokens": max_new_tokens, + "logprobs": return_logits if self.provider == "openai" else None, + "stop": stop_sequence, + "base_url": self.base_url, + "api_key": self.api_key, + "n": num_samples, + "caching": True, + "timeout": self.timeout, + } + + if "o1" in self.model: + logger.warning("O1 models do not support temperature, top_p, stop sequence. Disabling.") + else: + kwargs.update(self.generation_parameters.to_litellm_dict()) + + if kwargs.get("max_completion_tokens", None) is None: + kwargs["max_completion_tokens"] = max_new_tokens + + for attempt in range(self.API_MAX_RETRY): + try: + response = litellm.completion(**kwargs) + msg = response.choices[0].message + content = msg.content + reasoning = getattr(msg, "reasoning_content", None) + + if (not content) and reasoning: + return response + + if not content: + logger.info("Response is empty, retrying without caching") + kwargs["caching"] = False + response = litellm.completion(**kwargs) + msg = response.choices[0].message + content = msg.content + reasoning = getattr(msg, "reasoning_content", None) + + return response + except litellm.BadRequestError as e: + if "message" in e.__dict__ and "policy" in e.__dict__["message"]: + logger.warning(f"Content filtered. Returning empty response.") + return LitellmModelResponse() + except Exception as e: + wait_time = min(64, self.API_RETRY_SLEEP * (self.API_RETRY_MULTIPLIER**attempt)) + logger.warning(f"Error: {e}, waiting {wait_time}s before retry {attempt + 1}/{self.API_MAX_RETRY}") + time.sleep(wait_time) + + logger.error(f"API call failed after {self.API_MAX_RETRY} attempts.") + return LitellmModelResponse() + +# APPLY PATCH: Must use mangled name because original was private (__call_api) +LiteLLMClient._LiteLLMClient__call_api = _patched___call_api + +# --- Patched greedy_until: merge reasoning + content, preserve ordering --- +def _greedy_until_impl(self, docs: list[Doc]) -> list[ModelResponse]: + dataset = GenerativeTaskDataset(requests=docs, num_dataset_splits=self.DATASET_SPLITS) + results: list[ModelResponse] = [] + + for split in tqdm( + dataset.splits_iterator(), + total=dataset.num_dataset_splits, + desc="Splits", + position=0, + disable=self.disable_tqdm, + ): + contexts = [self.prompt_manager.prepare_prompt_api(doc) for doc in dataset] + + max_new_tokens = split[0].generation_size + return_logits = split[0].use_logits + num_samples = split[0].num_samples + stop_sequence = split[0].stop_sequences + + if num_samples > 1 and self.generation_parameters.temperature == 0: + raise ValueError("num_samples > 1 requires temperature > 0") + + # CRITICAL FIX: Access the private method via mangled name + responses = self._LiteLLMClient__call_api_parallel( + contexts, + return_logits, + max_new_tokens, + num_samples, + stop_sequence, + ) + + for response, context in zip(responses, contexts): + raw_contents = [(choice.message.content or "").strip() for choice in response.choices] + raw_reasonings = [(getattr(choice.message, "reasoning_content", None) or "").strip() for choice in response.choices] + + merged: list[str] = [] + for c, r in zip(raw_contents, raw_reasonings): + if c and r: + merged.append(r + "\n\n" + c) + elif c: + merged.append(c) + elif 
r: + merged.append(r) + else: + merged.append("") + + reasonings: list[str | None] = [r if r != "" else None for r in raw_reasonings] + + if not merged or merged[0] is None: + merged = [""] + + cur_response = ModelResponse( + text=merged, + reasonings=reasonings, + input=context, + ) + results.append(cur_response) + + if len(results) != len(dataset): + raise RuntimeError(f"Internal mismatch: {len(results)} outputs vs {len(dataset)} docs.") + + return dataset.get_original_order(results) + +# Re-apply caching decorator +LiteLLMClient.greedy_until = cached(SamplingMethod.GENERATIVE)(_greedy_until_impl) +PY + export PYTHONPATH="${patch_dir}:${PYTHONPATH:-}" +} + +run_lighteval_eval() { + set +x + local port="${PORT:-8888}" + local task="${EVAL_TASK:-gsm8k}" + local num_fewshot="${NUM_FEWSHOT:-5}" + local results_dir="${EVAL_RESULT_DIR:-eval_out_lighteval}" + local max_samples=0 + + while [[ $# -gt 0 ]]; do + case $1 in + --port) port="$2"; shift 2 ;; + --task) task="$2"; shift 2 ;; + --num-fewshot) num_fewshot="$2"; shift 2 ;; + --results-dir) results_dir="$2"; shift 2 ;; + --max-samples) max_samples="$2"; shift 2 ;; + *) echo "Unknown parameter: $1"; return 1 ;; + esac + done + + _install_lighteval_deps + _patch_lighteval_litellm + + # Prefer OPENAI_MODEL_NAME, then EVAL_MODEL_NAME, then MODEL + local model_name="${EVAL_MODEL_NAME:-${OPENAI_MODEL_NAME:-${MODEL}}}" + if [[ -z "$model_name" ]]; then + echo "Error: EVAL_MODEL_NAME / OPENAI_MODEL_NAME / MODEL not set for lighteval." >&2 + return 1 + fi + + # LiteLLM provider prefix logic + local lite_model="$model_name" + if [[ "$lite_model" != openai/* ]]; then + lite_model="openai/${lite_model}" + fi + + local base_url="http://0.0.0.0:${port}/v1" + export OPENAI_API_KEY="${OPENAI_API_KEY:-EMPTY}" + + local MODEL_ARGS="model_name=${lite_model},base_url=${base_url},api_key=${OPENAI_API_KEY}" + local TASK_SPEC="${task}|${num_fewshot}" + + set -x + lighteval endpoint litellm \ + "${MODEL_ARGS}" \ + "${TASK_SPEC}" \ + --output-dir "/workspace/${results_dir}" \ + --max-samples "${max_samples}" \ + --remove-reasoning-tags + set +x +} + + +# ------------------------------ +# Unified eval entrypoint +# ------------------------------ + +run_eval() { + set +x + local framework="${EVAL_FRAMEWORK:-lm-eval}" + local forwarded=() + + while [[ $# -gt 0 ]]; do + case "$1" in + --framework) framework="$2"; shift 2 ;; + *) forwarded+=("$1"); shift ;; + esac + done + + case "$framework" in + lm-eval|lm_eval) run_lm_eval "${forwarded[@]}" ;; + lighteval) run_lighteval_eval "${forwarded[@]}" ;; + *) echo "Unknown framework '${framework}'"; return 1 ;; + esac +} \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh index 63dcf76e1..777aa2c2d 100644 --- a/benchmarks/gptoss_fp4_mi300x_docker.sh +++ b/benchmarks/gptoss_fp4_mi300x_docker.sh @@ -25,6 +25,7 @@ export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +MODEL_NAME=${MODEL##*/} set -x vllm serve $MODEL --port $PORT \ @@ -36,6 +37,7 @@ vllm serve $MODEL --port $PORT \ --block-size=64 \ --no-enable-prefix-caching \ --disable-log-requests \ +--served-model-name $MODEL_NAME \ --async-scheduling > $SERVER_LOG 2>&1 & SERVER_PID=$! 
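# The MODEL_NAME derivation introduced above is plain bash parameter
# expansion: ${MODEL##*/} strips the longest prefix ending in "/", leaving the
# hub id's basename for --served-model-name. A minimal sketch, assuming the
# workflow's default model id:
#   MODEL=openai/gpt-oss-120b
#   MODEL_NAME=${MODEL##*/}   # -> gpt-oss-120b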
@@ -47,7 +49,8 @@ source "$(dirname "$0")/benchmark_lib.sh" wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" run_benchmark_serving \ - --model "$MODEL" \ + --model "$MODEL_NAME" \ + --tokenizer "$MODEL" \ --port "$PORT" \ --backend vllm \ --input-len "$ISL" \ @@ -59,6 +62,7 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_lm_eval --port "$PORT" +run_eval --framework lm-eval --port "$PORT" +run_eval --framework lighteval --task gsm8k --num-fewshot 5 append_lm_eval_summary set +x \ No newline at end of file From 5ef76ef40780e39c6e238541b3abd17cce3c413a Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 20 Nov 2025 21:45:28 -0600 Subject: [PATCH 091/214] Lighteval 1.75 --- .github/workflows/eval-gms8k.yml | 10 +++++++--- benchmarks/benchmark_lib.sh | 6 +++--- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 9ed618cda..cffb7277c 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -3,6 +3,11 @@ name: Eval - GSM8K (PoC) on: workflow_dispatch: inputs: + exp-name: + description: "Experiment name (prefix selects docker script)" + required: false + type: string + default: "gptoss_gsm8k_poc" image: description: "Serving image" required: false @@ -36,7 +41,7 @@ on: push: paths: - '.github/workflows/eval-gms8k.yml' - - '.github/workflows/eval-tmpl.yml' + - '.github/workflows/eval-tmpl.yml' jobs: eval: @@ -48,7 +53,7 @@ jobs: model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm precision: fp4 - exp-name: dsr1_gsm8k_poc + exp-name: ${{ inputs.exp-name || 'gptoss_gsm8k_poc' }} tp: '8' ep: '1' dp-attn: false @@ -56,4 +61,3 @@ jobs: eval-task: gsm8k num-fewshot: ${{ inputs.num_fewshot || '5' }} limit: ${{ inputs.limit || '200' }} - diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index bf96c8bda..9c33df709 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -274,7 +274,7 @@ run_lm_eval() { --num_fewshot "${num_fewshot}" \ --batch_size "${batch_size}" \ --output_path "/workspace/${results_dir}" \ - --model_args "model=${MODEL},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=3,num_concurrent=32,tokenized_requests=False" \ + --model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=3,num_concurrent=32,tokenized_requests=False" \ --gen_kwargs "max_tokens=${gen_max_tokens},temperature=${temperature},top_p=${top_p}" set +x } @@ -488,9 +488,9 @@ run_lighteval_eval() { _patch_lighteval_litellm # Prefer OPENAI_MODEL_NAME, then EVAL_MODEL_NAME, then MODEL - local model_name="${EVAL_MODEL_NAME:-${OPENAI_MODEL_NAME:-${MODEL}}}" + local model_name="${MODEL_NAME}" if [[ -z "$model_name" ]]; then - echo "Error: EVAL_MODEL_NAME / OPENAI_MODEL_NAME / MODEL not set for lighteval." >&2 + echo "Error: MODEL not set for lighteval." 
>&2 return 1 fi From 30812413d38e632fad6109647d3027d704c4fd03 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 20 Nov 2025 21:59:36 -0600 Subject: [PATCH 092/214] Lighteval Mi325x --- .github/workflows/eval-gms8k.yml | 2 +- benchmarks/gptoss_fp4_b200_docker.sh | 12 ++++++++---- benchmarks/gptoss_fp4_mi325x_slurm.sh | 13 +++++++------ 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index cffb7277c..0fca89d5b 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -48,7 +48,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi300x-amd_0 + runner: mi325x-tw_1 image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh index 8b5b6c881..92e785663 100644 --- a/benchmarks/gptoss_fp4_b200_docker.sh +++ b/benchmarks/gptoss_fp4_b200_docker.sh @@ -45,11 +45,12 @@ export PYTHONNOUSERSITE=1 export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +MODEL_NAME=${MODEL##*/} set -x vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ --gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs 512 \ ---disable-log-requests > $SERVER_LOG 2>&1 & +--disable-log-requests --served-model-name $MODEL_NAME > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -62,7 +63,8 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas run_benchmark_serving \ - --model "$MODEL" \ + --model "$MODEL_NAME" \ + --tokenizer "$MODEL" \ --port "$PORT" \ --backend vllm \ --input-len "$ISL" \ @@ -74,5 +76,7 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_lm_eval --port "$PORT" -append_lm_eval_summary \ No newline at end of file +run_eval --framework lm-eval --port "$PORT" +run_eval --framework lighteval --task gsm8k --num-fewshot 5 +append_lm_eval_summary +set +x \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh index 4219d0662..3394bcc04 100644 --- a/benchmarks/gptoss_fp4_mi325x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh @@ -33,9 +33,8 @@ fi export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 +MODEL_NAME=${MODEL##*/} -# -## Start up vllm server set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ @@ -46,8 +45,8 @@ vllm serve $MODEL --port $PORT \ --block-size=64 \ --no-enable-prefix-caching \ --disable-log-requests \ ---async-scheduling \ -> $SERVER_LOG 2>&1 & +--served-model-name $MODEL_NAME \ +--async-scheduling > $SERVER_LOG 2>&1 & SERVER_PID=$! 
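# wait_for_server_ready (benchmark_lib.sh) gates the client on the same probe
# a manual check would use: curl --silent --fail against /health, retried
# every 5 seconds by default. A sketch of the equivalent loop, assuming the
# default port 8888:
#   until curl -sf http://0.0.0.0:8888/health; do sleep 5; done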
@@ -58,7 +57,8 @@ source "$(dirname "$0")/benchmark_lib.sh" wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" run_benchmark_serving \ - --model "$MODEL" \ + --model "$MODEL_NAME" \ + --tokenizer "$MODEL" \ --port "$PORT" \ --backend vllm \ --input-len "$ISL" \ @@ -70,6 +70,7 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_lm_eval --port "$PORT" +run_eval --framework lm-eval --port "$PORT" +run_eval --framework lighteval --task gsm8k --num-fewshot 5 append_lm_eval_summary set +x \ No newline at end of file From f182319a122bb2482dc0f31bfbc6c2aed516b493 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 21 Nov 2025 07:19:13 -0600 Subject: [PATCH 093/214] Lighteval Mi300x CR --- .github/workflows/eval-gms8k.yml | 2 +- benchmarks/benchmark_lib.sh | 32 ++++++++++++++++++++++---------- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 0fca89d5b..fc284160c 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -48,7 +48,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi325x-tw_1 + runner: mi300x-cr_0 image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 9c33df709..efddc3230 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -283,16 +283,28 @@ append_lm_eval_summary() { set +x local results_dir="${EVAL_RESULT_DIR:-eval_out}" local task="${EVAL_TASK:-gsm8k}" + # Always render a local summary so the runner can pick it up + local out_dir="/workspace/${results_dir}" + local summary_md="${out_dir}/SUMMARY.md" + mkdir -p "$out_dir" || true + + python3 utils/lm_eval_to_md.py \ + --results-dir "$out_dir" \ + --task "${task}" \ + --framework "${FRAMEWORK}" \ + --precision "${PRECISION}" \ + --tp "${TP:-1}" \ + --ep "${EP_SIZE:-1}" \ + --dp-attention "${DP_ATTENTION:-false}" \ + > "$summary_md" || true + + # If running inside a GitHub Actions step on this same machine, append there too if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then - python3 utils/lm_eval_to_md.py \ - --results-dir "/workspace/${results_dir}" \ - --task "${task}" \ - --framework "${FRAMEWORK}" \ - --precision "${PRECISION}" \ - --tp "${TP:-1}" \ - --ep "${EP_SIZE:-1}" \ - --dp-attention "${DP_ATTENTION:-false}" \ - >> "$GITHUB_STEP_SUMMARY" || true + local GH_SUM_DIR + GH_SUM_DIR="$(dirname "$GITHUB_STEP_SUMMARY")" + if [ -d "$GH_SUM_DIR" ] && [ -w "$GH_SUM_DIR" ]; then + cat "$summary_md" >> "$GITHUB_STEP_SUMMARY" || true + fi fi } @@ -538,4 +550,4 @@ run_eval() { lighteval) run_lighteval_eval "${forwarded[@]}" ;; *) echo "Unknown framework '${framework}'"; return 1 ;; esac -} \ No newline at end of file +} From 5ba2cf2691cc4af5832ddf6c9fcc8a513e209d0e Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 21 Nov 2025 07:51:23 -0600 Subject: [PATCH 094/214] Lighteval Mi355x amd --- .github/workflows/eval-gms8k.yml | 2 +- benchmarks/gptoss_fp4_mi355x_docker.sh | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index fc284160c..1ef8ea3bf 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -48,7 +48,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml 
secrets: inherit with: - runner: mi300x-cr_0 + runner: mi355x-amd_4 image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh index f63cc9960..a413acd69 100644 --- a/benchmarks/gptoss_fp4_mi355x_docker.sh +++ b/benchmarks/gptoss_fp4_mi355x_docker.sh @@ -23,6 +23,7 @@ export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_FUSED_MOE_A16W4=1 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +MODEL_NAME=${MODEL##*/} set -x vllm serve $MODEL --port $PORT \ @@ -34,6 +35,7 @@ vllm serve $MODEL --port $PORT \ --block-size=64 \ --no-enable-prefix-caching \ --disable-log-requests \ +--served-model-name $MODEL_NAME \ --async-scheduling > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -45,7 +47,8 @@ source "$(dirname "$0")/benchmark_lib.sh" wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" run_benchmark_serving \ - --model "$MODEL" \ + --model "$MODEL_NAME" \ + --tokenizer "$MODEL" \ --port "$PORT" \ --backend vllm \ --input-len "$ISL" \ @@ -57,6 +60,7 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_lm_eval --port "$PORT" +run_eval --framework lm-eval --port "$PORT" +run_eval --framework lighteval --task gsm8k --num-fewshot 5 append_lm_eval_summary set +x \ No newline at end of file From 5bf69ab6d9678c98e10c011abe579f79e4ee8fab Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 21 Nov 2025 07:54:46 -0600 Subject: [PATCH 095/214] Lighteval b200_nvd --- .github/workflows/eval-gms8k.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 1ef8ea3bf..60551abf5 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -48,8 +48,8 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi355x-amd_4 - image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} + runner: b200-nvd_0 + image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm precision: fp4 From f862689b444999f4f6af87d15ac1239a748db263 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 21 Nov 2025 08:00:44 -0600 Subject: [PATCH 096/214] Lighteval h200_cr0 --- .github/workflows/eval-gms8k.yml | 2 +- benchmarks/gptoss_fp4_b200_docker.sh | 10 +++++++--- benchmarks/gptoss_fp4_h100_docker.sh | 14 +++++++++----- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 60551abf5..ee15335e2 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -48,7 +48,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: b200-nvd_0 + runner: h100-cr_0 image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh index 92e785663..1f4679383 100644 --- a/benchmarks/gptoss_fp4_b200_docker.sh +++ b/benchmarks/gptoss_fp4_b200_docker.sh @@ -48,9 +48,13 @@ SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) MODEL_NAME=${MODEL##*/} set -x -vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ ---gpu-memory-utilization 0.9 
--tensor-parallel-size $TP --max-num-seqs 512 \ ---disable-log-requests --served-model-name $MODEL_NAME > $SERVER_LOG 2>&1 & +vllm serve $MODEL --host 0.0.0.0 --port $PORT \ +--config config.yaml \ +--gpu-memory-utilization 0.9 \ +--tensor-parallel-size $TP \ +--max-num-seqs 512 \ +--disable-log-requests \ +--served-model-name $MODEL_NAME > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 212059b04..e9c1cfc5a 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -26,15 +26,16 @@ EOF export PYTHONNOUSERSITE=1 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +MODEL_NAME=${MODEL##*/} -# Start server in the background, shld be openai/gpt-oss-120b set -x vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --config config.yaml \ --gpu-memory-utilization=0.9 \ --tensor-parallel-size=$TP \ --max-num-seqs=$CONC \ ---disable-log-requests > $SERVER_LOG 2>&1 & +--disable-log-requests \ +--served-model-name $MODEL_NAME > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -47,7 +48,8 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas run_benchmark_serving \ - --model "$MODEL" \ + --model "$MODEL_NAME" \ + --tokenizer "$MODEL" \ --port "$PORT" \ --backend vllm \ --input-len "$ISL" \ @@ -59,5 +61,7 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_lm_eval --port "$PORT" -append_lm_eval_summary \ No newline at end of file +run_eval --framework lm-eval --port "$PORT" +run_eval --framework lighteval --task gsm8k --num-fewshot 5 +append_lm_eval_summary +set +x \ No newline at end of file From c3df519fea9019208f1c04f4a8bfba4851c3c949 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 21 Nov 2025 08:03:45 -0600 Subject: [PATCH 097/214] Lighteval h200-nb_1 --- .github/workflows/eval-gms8k.yml | 2 +- benchmarks/gptoss_fp4_h200_slurm.sh | 20 ++++++++++++++------ 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index ee15335e2..8cc9a6e42 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -48,7 +48,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: h100-cr_0 + runner: h200-nb_1 image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh index 2c18d4d6a..9906b2fa5 100644 --- a/benchmarks/gptoss_fp4_h200_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_slurm.sh @@ -41,12 +41,17 @@ EOF SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) export TORCH_CUDA_ARCH_LIST="9.0" PORT=$(( 8888 + $PORT_OFFSET )) +MODEL_NAME=${MODEL##*/} export TORCH_CUDA_ARCH_LIST="9.0" -PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ - --gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \ - --disable-log-requests > $SERVER_LOG 2>&1 & +PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT \ + --config config.yaml \ + --gpu-memory-utilization 0.9 \ + --tensor-parallel-size $TP \ + --max-num-seqs $CONC \ + --disable-log-requests \ + --served-model-name $MODEL_NAME > $SERVER_LOG 2>&1 & SERVER_PID=$! 
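# The listen port above is offset from the 8888 base, presumably so jobs
# sharing a Slurm node don't collide. A sketch with a hypothetical offset:
#   PORT_OFFSET=2
#   PORT=$(( 8888 + PORT_OFFSET ))   # -> 8890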
@@ -57,7 +62,8 @@ source "$(dirname "$0")/benchmark_lib.sh" wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" run_benchmark_serving \ - --model "$MODEL" \ + --model "$MODEL_NAME" \ + --tokenizer "$MODEL" \ --port "$PORT" \ --backend vllm \ --input-len "$ISL" \ @@ -69,5 +75,7 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_lm_eval --port "$PORT" -append_lm_eval_summary \ No newline at end of file +run_eval --framework lm-eval --port "$PORT" +run_eval --framework lighteval --task gsm8k --num-fewshot 5 +append_lm_eval_summary +set +x \ No newline at end of file From c1edb9ac6c6a742fe61653a599c6fabc65ac8ece Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 21 Nov 2025 08:06:32 -0600 Subject: [PATCH 098/214] Lighteval h100-cw_1 --- .github/workflows/eval-gms8k.yml | 2 +- benchmarks/gptoss_fp4_h100_slurm.sh | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 8cc9a6e42..e980da0aa 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -48,7 +48,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: h200-nb_1 + runner: h100-cw_1 image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh index b463a8aaf..394d68bc1 100644 --- a/benchmarks/gptoss_fp4_h100_slurm.sh +++ b/benchmarks/gptoss_fp4_h100_slurm.sh @@ -27,6 +27,7 @@ EOF SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) export TORCH_CUDA_ARCH_LIST="9.0" PORT=${PORT:-8888} +MODEL_NAME=${MODEL##*/} set -x PYTHONNOUSERSITE=1 vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ @@ -35,7 +36,7 @@ PYTHONNOUSERSITE=1 vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --tensor-parallel-size=$TP \ --max-num-seqs=$CONC \ --disable-log-requests \ - > $SERVER_LOG 2>&1 & + --served-model-name $MODEL_NAME > $SERVER_LOG 2>&1 & SERVER_PID=$! 
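# For reference, run_eval --framework lighteval --task gsm8k --num-fewshot 5
# expands via run_lighteval_eval (benchmark_lib.sh) to roughly the following,
# assuming MODEL_NAME=gpt-oss-120b and the default port, key, and output dir:
#   lighteval endpoint litellm \
#     "model_name=openai/gpt-oss-120b,base_url=http://0.0.0.0:8888/v1,api_key=EMPTY" \
#     "gsm8k|5" \
#     --output-dir /workspace/eval_out_lighteval \
#     --max-samples 0 \
#     --remove-reasoning-tags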
@@ -48,7 +49,8 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas run_benchmark_serving \ - --model "$MODEL" \ + --model "$MODEL_NAME" \ + --tokenizer "$MODEL" \ --port "$PORT" \ --backend vllm \ --input-len "$ISL" \ @@ -60,5 +62,7 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_lm_eval --port "$PORT" +run_eval --framework lm-eval --port "$PORT" +run_eval --framework lighteval --task gsm8k --num-fewshot 5 append_lm_eval_summary +set +x From d21826b355a58a5b34ecf1dfe59f2b196be42ea0 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 21 Nov 2025 22:41:53 -0600 Subject: [PATCH 099/214] Error reproduction --- .github/workflows/eval-gms8k.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index e980da0aa..dfa643398 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -47,8 +47,8 @@ jobs: eval: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit - with: - runner: h100-cw_1 + with: + runner: h200-nb_1 image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm From abdad78c7463e38610a387221ca5a0ac11c5a2ea Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 21 Nov 2025 23:02:36 -0600 Subject: [PATCH 100/214] Error file removal --- benchmarks/benchmark_lib.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index efddc3230..e9c2f20d1 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -538,6 +538,10 @@ run_eval() { local framework="${EVAL_FRAMEWORK:-lm-eval}" local forwarded=() + # Defensive cleanup: remove any LiteLLM cache in the repo workspace so + # subsequent steps (e.g., actions/checkout) won't hit permission issues. + rm -rf .litellm_cache 2>/dev/null || true + while [[ $# -gt 0 ]]; do case "$1" in --framework) framework="$2"; shift 2 ;; @@ -550,4 +554,7 @@ run_eval() { lighteval) run_lighteval_eval "${forwarded[@]}" ;; *) echo "Unknown framework '${framework}'"; return 1 ;; esac + + # Clean up again after eval, in case the tool recreated it. 
+ rm -rf .litellm_cache 2>/dev/null || true } From bd3653035bd9f0da0f1f81246dee5a2fbb37d8e1 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 22 Nov 2025 15:52:55 -0600 Subject: [PATCH 101/214] error reproducibility --- .github/workflows/eval-gms8k.yml | 6 +++--- benchmarks/benchmark_lib.sh | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index dfa643398..cffb7277c 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -47,9 +47,9 @@ jobs: eval: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit - with: - runner: h200-nb_1 - image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} + with: + runner: mi300x-amd_0 + image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm precision: fp4 diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index e9c2f20d1..52c972fc4 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -540,7 +540,7 @@ run_eval() { # Defensive cleanup: remove any LiteLLM cache in the repo workspace so # subsequent steps (e.g., actions/checkout) won't hit permission issues. - rm -rf .litellm_cache 2>/dev/null || true + #rm -rf .litellm_cache 2>/dev/null || true while [[ $# -gt 0 ]]; do case "$1" in @@ -556,5 +556,5 @@ run_eval() { esac # Clean up again after eval, in case the tool recreated it. - rm -rf .litellm_cache 2>/dev/null || true + #rm -rf .litellm_cache 2>/dev/null || true } From a0434b1ea5af218d9115281cd2cb483909d9afb8 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 22 Nov 2025 16:26:29 -0600 Subject: [PATCH 102/214] should NOT error reproduce --- .github/workflows/eval-tmpl.yml | 3 ++- benchmarks/benchmark_lib.sh | 8 ++------ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 2d2e8404a..2d21820c6 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -113,6 +113,7 @@ jobs: fi # Best-effort cleanup of prior eval outputs; do not block safe_timeout sudo rm -rf /home/ubuntu/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out/openai__gpt-oss-120b || true + safe_timeout sudo rm -rf /home/ubuntu/actions-runner/_work/InferenceMAX/InferenceMAX/.litellm_cache/cache.db || true if command -v squeue >/dev/null 2>&1; then echo "[Slurm] Cleaning up resources ..." @@ -138,7 +139,7 @@ jobs: with: fetch-depth: 0 # Avoid aggressive workspace deletion if stale, rely on git reset/clean later - clean: false + clean: true - name: Launch eval via runner script env: diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 52c972fc4..c91e95707 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -515,7 +515,7 @@ run_lighteval_eval() { local base_url="http://0.0.0.0:${port}/v1" export OPENAI_API_KEY="${OPENAI_API_KEY:-EMPTY}" - local MODEL_ARGS="model_name=${lite_model},base_url=${base_url},api_key=${OPENAI_API_KEY}" + local MODEL_ARGS="model_name=${lite_model},base_url=${base_url},api_key=${OPENAI_API_KEY},generation_parameters={temperature:1.0}" local TASK_SPEC="${task}|${num_fewshot}" set -x @@ -538,10 +538,6 @@ run_eval() { local framework="${EVAL_FRAMEWORK:-lm-eval}" local forwarded=() - # Defensive cleanup: remove any LiteLLM cache in the repo workspace so - # subsequent steps (e.g., actions/checkout) won't hit permission issues. 
- #rm -rf .litellm_cache 2>/dev/null || true - while [[ $# -gt 0 ]]; do case "$1" in --framework) framework="$2"; shift 2 ;; @@ -556,5 +552,5 @@ run_eval() { esac # Clean up again after eval, in case the tool recreated it. - #rm -rf .litellm_cache 2>/dev/null || true + rm -rf .litellm_cache 2>/dev/null || true } From f56a3117da9db7aa2367f1366a19dab39a717a86 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 22 Nov 2025 16:31:02 -0600 Subject: [PATCH 103/214] should NOT error reproduce --- .github/workflows/eval-tmpl.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 2d21820c6..cbfd04faf 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -80,6 +80,7 @@ jobs: steps: - name: Resource cleanup run: | + sudo rm -rf /home/kimbosemianalysis/actions-runner/_work/InferenceMAX/InferenceMAX/.litellm_cache/ # Helper to avoid indefinite hangs on flaky tools (Docker/Slurm) safe_timeout() { if command -v timeout >/dev/null 2>&1; then From 27bd2de9691dab1c253c77f0ae7701dbc4a517fa Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 22 Nov 2025 16:33:34 -0600 Subject: [PATCH 104/214] should NOT error reproduce --- .github/workflows/eval-tmpl.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index cbfd04faf..bb9ec6fe0 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -81,6 +81,7 @@ jobs: - name: Resource cleanup run: | sudo rm -rf /home/kimbosemianalysis/actions-runner/_work/InferenceMAX/InferenceMAX/.litellm_cache/ + sudo rm -rf /home/kimbosemianalysis/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out/ # Helper to avoid indefinite hangs on flaky tools (Docker/Slurm) safe_timeout() { if command -v timeout >/dev/null 2>&1; then From c058b1663c4bf430fbf194846a7dce4043a51c6f Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 22 Nov 2025 16:36:41 -0600 Subject: [PATCH 105/214] should NOT error reproduce --- .github/workflows/eval-tmpl.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index bb9ec6fe0..1930f0d2c 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -82,6 +82,7 @@ jobs: run: | sudo rm -rf /home/kimbosemianalysis/actions-runner/_work/InferenceMAX/InferenceMAX/.litellm_cache/ sudo rm -rf /home/kimbosemianalysis/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out/ + sudo rm -rf /home/kimbosemianalysis/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out_lighteval # Helper to avoid indefinite hangs on flaky tools (Docker/Slurm) safe_timeout() { if command -v timeout >/dev/null 2>&1; then From 2e3691449a993c0e19d7a27146be18335b49ba46 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 23 Nov 2025 18:17:42 +0800 Subject: [PATCH 106/214] Double check other runner --- .github/workflows/eval-gms8k.yml | 4 ++-- benchmarks/benchmark_lib.sh | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index cffb7277c..8cc9a6e42 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -48,8 +48,8 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi300x-amd_0 - image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} + runner: h200-nb_1 + image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ 
inputs.model || 'openai/gpt-oss-120b' }} framework: vllm precision: fp4 diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index c91e95707..8692ec40d 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -551,6 +551,4 @@ run_eval() { *) echo "Unknown framework '${framework}'"; return 1 ;; esac - # Clean up again after eval, in case the tool recreated it. - rm -rf .litellm_cache 2>/dev/null || true } From d2cf0fbbeb12773db6edc19769ed2c28f301e0ce Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 23 Nov 2025 18:21:35 +0800 Subject: [PATCH 107/214] Cleanup MI300x_AMD --- .github/workflows/eval-gms8k.yml | 4 +- benchmarks/benchmark_lib.sh | 51 ++++++++++++++++++++++++++ benchmarks/gptoss_fp4_mi300x_docker.sh | 3 ++ 3 files changed, 56 insertions(+), 2 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 8cc9a6e42..cffb7277c 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -48,8 +48,8 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: h200-nb_1 - image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} + runner: mi300x-amd_0 + image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm precision: fp4 diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 8692ec40d..a52e7cc17 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -552,3 +552,54 @@ run_eval() { esac } + +# ...existing code... + +# ------------------------------ +# Cleanup utilities +# ------------------------------ + +# Clean up evaluation and cache artifacts +# This function should be called at the end of benchmark/eval scripts +cleanup_eval_artifacts() { + set +x + echo "[Cleanup] Removing evaluation artifacts and cache directories..." 
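+    # (Note: the /workspace paths below assume the benchmark container mounts the
+    # repo checkout at /workspace; each removal is "|| true" best-effort, so this
+    # cleanup can never itself fail a benchmark run.)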
+ + # Clean up litellm cache + if [ -d "/workspace/.litellm_cache" ]; then + rm -rf /workspace/.litellm_cache || true + echo "[Cleanup] Removed .litellm_cache" + fi + + # Clean up eval output directories + for dir in /workspace/eval_out* /workspace/.cache; do + if [ -d "$dir" ]; then + rm -rf "$dir" || true + echo "[Cleanup] Removed $dir" + fi + done + + # Clean up temporary benchmark directories + if [ -n "${BENCH_SERVING_DIR:-}" ] && [ -d "$BENCH_SERVING_DIR" ]; then + rm -rf "$BENCH_SERVING_DIR" || true + echo "[Cleanup] Removed benchmark serving temp dir" + fi + + # Clean up Python cache + find /workspace -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true + find /workspace -type f -name "*.pyc" -delete 2>/dev/null || true + + # Fix permissions for any remaining files (in case cleanup is run without sudo) + chmod -R 777 /workspace/.litellm_cache 2>/dev/null || true + chmod -R 777 /workspace/eval_out* 2>/dev/null || true + + echo "[Cleanup] Artifact cleanup complete" + set -x +} + +# Trap to ensure cleanup runs even if script fails +# Call this at the start of your benchmark scripts: +# trap cleanup_eval_artifacts EXIT +setup_cleanup_trap() { + trap cleanup_eval_artifacts EXIT INT TERM +} diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh index 777aa2c2d..31ac5ba56 100644 --- a/benchmarks/gptoss_fp4_mi300x_docker.sh +++ b/benchmarks/gptoss_fp4_mi300x_docker.sh @@ -61,6 +61,9 @@ run_benchmark_serving \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ +# Auto cleanup on exit +setup_cleanup_trap + # After throughput, run evaluation (defaults to GSM8K) run_eval --framework lm-eval --port "$PORT" run_eval --framework lighteval --task gsm8k --num-fewshot 5 From 0a8901ab64e3d4f916189c5a927683aca0f90a17 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 23 Nov 2025 18:37:43 +0800 Subject: [PATCH 108/214] Cleanup MI300x_AMD --- .github/workflows/eval-tmpl.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 1930f0d2c..2d21820c6 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -80,9 +80,6 @@ jobs: steps: - name: Resource cleanup run: | - sudo rm -rf /home/kimbosemianalysis/actions-runner/_work/InferenceMAX/InferenceMAX/.litellm_cache/ - sudo rm -rf /home/kimbosemianalysis/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out/ - sudo rm -rf /home/kimbosemianalysis/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out_lighteval # Helper to avoid indefinite hangs on flaky tools (Docker/Slurm) safe_timeout() { if command -v timeout >/dev/null 2>&1; then From afd304fab4c7c5658700c8c014d68c16c4d0cfd2 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 23 Nov 2025 18:43:05 +0800 Subject: [PATCH 109/214] Cleanup MI300x_AMD --- .github/workflows/eval-tmpl.yml | 8 ++++ benchmarks/benchmark_lib.sh | 54 +------------------------- benchmarks/gptoss_fp4_mi300x_docker.sh | 3 -- 3 files changed, 10 insertions(+), 55 deletions(-) diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 2d21820c6..195d0a158 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -159,3 +159,11 @@ jobs: ${{ env.EVAL_RESULT_DIR }}/ ${{ env.EVAL_RESULT_DIR }}/* ${{ env.EVAL_RESULT_DIR }}/** + + - name: Resource cleanup + run: | + safe_timeout sudo rm -rf \ + "${{ github.workspace }}/.litellm_cache" \ + "${{ github.workspace }}/eval_out"* \ + "${{ github.workspace }}/.cache" \ + || true 
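
[Note between the two file diffs: the underlying problem all of these cleanup steps
work around is file ownership. The eval containers run as root, so anything they write
under the bind-mounted workspace (.litellm_cache, eval_out*) ends up root-owned, and a
later actions/checkout running as the unprivileged runner user cannot delete it; hence
the sudo rm -rf at the workflow level. A minimal reproduction of the symptom, assuming
Docker is available and the current user is not root:

    docker run --rm -v "$PWD:/workspace" -w /workspace ubuntu:22.04 \
      bash -c 'mkdir -p eval_out && touch eval_out/cache.db'
    ls -ld eval_out    # owned by root:root
    rm -rf eval_out    # "Permission denied" without sudo
]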
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index a52e7cc17..aaa9dbf88 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -523,6 +523,7 @@ run_lighteval_eval() { "${MODEL_ARGS}" \ "${TASK_SPEC}" \ --output-dir "/workspace/${results_dir}" \ + --use-chat-template \ --max-samples "${max_samples}" \ --remove-reasoning-tags set +x @@ -551,55 +552,4 @@ run_eval() { *) echo "Unknown framework '${framework}'"; return 1 ;; esac -} - -# ...existing code... - -# ------------------------------ -# Cleanup utilities -# ------------------------------ - -# Clean up evaluation and cache artifacts -# This function should be called at the end of benchmark/eval scripts -cleanup_eval_artifacts() { - set +x - echo "[Cleanup] Removing evaluation artifacts and cache directories..." - - # Clean up litellm cache - if [ -d "/workspace/.litellm_cache" ]; then - rm -rf /workspace/.litellm_cache || true - echo "[Cleanup] Removed .litellm_cache" - fi - - # Clean up eval output directories - for dir in /workspace/eval_out* /workspace/.cache; do - if [ -d "$dir" ]; then - rm -rf "$dir" || true - echo "[Cleanup] Removed $dir" - fi - done - - # Clean up temporary benchmark directories - if [ -n "${BENCH_SERVING_DIR:-}" ] && [ -d "$BENCH_SERVING_DIR" ]; then - rm -rf "$BENCH_SERVING_DIR" || true - echo "[Cleanup] Removed benchmark serving temp dir" - fi - - # Clean up Python cache - find /workspace -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true - find /workspace -type f -name "*.pyc" -delete 2>/dev/null || true - - # Fix permissions for any remaining files (in case cleanup is run without sudo) - chmod -R 777 /workspace/.litellm_cache 2>/dev/null || true - chmod -R 777 /workspace/eval_out* 2>/dev/null || true - - echo "[Cleanup] Artifact cleanup complete" - set -x -} - -# Trap to ensure cleanup runs even if script fails -# Call this at the start of your benchmark scripts: -# trap cleanup_eval_artifacts EXIT -setup_cleanup_trap() { - trap cleanup_eval_artifacts EXIT INT TERM -} +} \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh index 31ac5ba56..777aa2c2d 100644 --- a/benchmarks/gptoss_fp4_mi300x_docker.sh +++ b/benchmarks/gptoss_fp4_mi300x_docker.sh @@ -61,9 +61,6 @@ run_benchmark_serving \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ -# Auto cleanup on exit -setup_cleanup_trap - # After throughput, run evaluation (defaults to GSM8K) run_eval --framework lm-eval --port "$PORT" run_eval --framework lighteval --task gsm8k --num-fewshot 5 From ef2ee409eec0576403aeaac19212d25fc13712b2 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 23 Nov 2025 19:09:33 +0800 Subject: [PATCH 110/214] Cleanup MI300x_AMD MUST WORK --- .github/workflows/eval-tmpl.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 195d0a158..3b8885124 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -80,6 +80,9 @@ jobs: steps: - name: Resource cleanup run: | + sudo rm -rf /home/kimbosemianalysis/actions-runner/_work/InferenceMAX/InferenceMAX/.litellm_cache/ + sudo rm -rf /home/kimbosemianalysis/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out/ + sudo rm -rf /home/kimbosemianalysis/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out_lighteval # Helper to avoid indefinite hangs on flaky tools (Docker/Slurm) safe_timeout() { if command -v timeout >/dev/null 
2>&1; then @@ -159,11 +162,11 @@ jobs: ${{ env.EVAL_RESULT_DIR }}/ ${{ env.EVAL_RESULT_DIR }}/* ${{ env.EVAL_RESULT_DIR }}/** - + - name: Resource cleanup run: | - safe_timeout sudo rm -rf \ + sudo rm -rf \ "${{ github.workspace }}/.litellm_cache" \ "${{ github.workspace }}/eval_out"* \ "${{ github.workspace }}/.cache" \ - || true + || true \ No newline at end of file From 379069623766a95963fdff2e4acb554fc489f2a4 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 23 Nov 2025 19:18:58 +0800 Subject: [PATCH 111/214] works --- .github/workflows/eval-tmpl.yml | 5 +---- benchmarks/benchmark_lib.sh | 1 - 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 3b8885124..27f9b8b3f 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -80,9 +80,6 @@ jobs: steps: - name: Resource cleanup run: | - sudo rm -rf /home/kimbosemianalysis/actions-runner/_work/InferenceMAX/InferenceMAX/.litellm_cache/ - sudo rm -rf /home/kimbosemianalysis/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out/ - sudo rm -rf /home/kimbosemianalysis/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out_lighteval # Helper to avoid indefinite hangs on flaky tools (Docker/Slurm) safe_timeout() { if command -v timeout >/dev/null 2>&1; then @@ -162,7 +159,7 @@ jobs: ${{ env.EVAL_RESULT_DIR }}/ ${{ env.EVAL_RESULT_DIR }}/* ${{ env.EVAL_RESULT_DIR }}/** - + - name: Resource cleanup run: | sudo rm -rf \ diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index aaa9dbf88..c0b8bbb0b 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -523,7 +523,6 @@ run_lighteval_eval() { "${MODEL_ARGS}" \ "${TASK_SPEC}" \ --output-dir "/workspace/${results_dir}" \ - --use-chat-template \ --max-samples "${max_samples}" \ --remove-reasoning-tags set +x From 92f244cb7b45fa8631debd3e64194d6916c24da1 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 25 Nov 2025 13:17:07 +0800 Subject: [PATCH 112/214] Working lighteval --- .github/workflows/eval-gms8k.yml | 8 ++-- .github/workflows/eval-tmpl.yml | 7 ++-- benchmarks/benchmark_lib.sh | 63 +++++++++++++++++--------------- 3 files changed, 40 insertions(+), 38 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index cffb7277c..34fe89d3c 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -22,7 +22,7 @@ on: description: "Tensor Parallel Size" required: false type: string - default: "8" + default: "2" port: description: "Server port" required: false @@ -48,13 +48,13 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi300x-amd_0 - image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} + runner: h200-nb_1 + image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm precision: fp4 exp-name: ${{ inputs.exp-name || 'gptoss_gsm8k_poc' }} - tp: '8' + tp: '2' ep: '1' dp-attn: false port: ${{ inputs.port || '8888' }} diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 27f9b8b3f..63e568dd2 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -64,8 +64,7 @@ env: NUM_FEWSHOT: ${{ inputs['num-fewshot'] }} LIMIT: ${{ inputs.limit }} EVAL_RESULT_DIR: eval_out - # Server-side concurrency default (used by some server scripts) - CONC: '8' + CONC: '4' MAX_MODEL_LEN: '8192' ISL: 1024 OSL: 
1024
@@ -111,8 +111,8 @@ jobs:
           echo "[Docker] skipping docker cleanup on host $host"
         fi
         # Best-effort cleanup of prior eval outputs; do not block
-        safe_timeout sudo rm -rf /home/ubuntu/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out/openai__gpt-oss-120b || true
-        safe_timeout sudo rm -rf /home/ubuntu/actions-runner/_work/InferenceMAX/InferenceMAX/.litellm_cache/cache.db || true
+        sudo rm -rf /home/ubuntu/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out/openai__gpt-oss-120b || true
+        sudo rm -rf /home/ubuntu/actions-runner/_work/InferenceMAX/InferenceMAX/.litellm_cache/cache.db || true
 
         if command -v squeue >/dev/null 2>&1; then
           echo "[Slurm] Cleaning up resources ..."
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index c0b8bbb0b..6b9ac8f65 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -319,6 +319,8 @@ _install_lighteval_deps() {
 }
 
 # Patch lighteval's LiteLLMClient to handle reasoning content and Python name mangling
+# 1. Removed "response_format": {"type": "text"}, as it interfered with the vLLM endpoint
+# 2. Concatenate reasoning with the output tokens, since the output is sometimes empty.
 _patch_lighteval_litellm() {
   set +x
   local patch_dir
@@ -340,13 +342,12 @@ from lighteval.utils.cache_management import cached
 
 logger = logging.getLogger(__name__)
 
-# --- Patched __call_api: don't retry when we have reasoning_content ---
+# --- Patched __call_api: don't retry when we have reasoning_content, enforce chat template on vLLM and avoid stop interference ---
 def _patched___call_api(self, prompt, return_logits, max_new_tokens, num_samples, stop_sequence):  # noqa: C901
-    """Make API call with retries, but don't treat reasoning-only responses as empty."""
     from lighteval.models.endpoints.litellm_model import LitellmModelResponse
-    response = LitellmModelResponse()
-    stop_sequence = self._prepare_stop_sequence(stop_sequence)
+
+    stop_sequence = None  # Important: let the chat template drive turn-taking
     max_new_tokens = self._prepare_max_new_tokens(max_new_tokens)
 
     if return_logits and not self.provider == "openai":
@@ -355,15 +356,18 @@ def _patched___call_api(self, prompt, return_logits, max_new_tokens, num_samples
     kwargs = {
         "model": self.model,
         "messages": prompt,
-        "response_format": {"type": "text"},
         "max_tokens": max_new_tokens,
         "logprobs": return_logits if self.provider == "openai" else None,
-        "stop": stop_sequence,
+        "stop": stop_sequence,  # disabled for chat
         "base_url": self.base_url,
         "api_key": self.api_key,
         "n": num_samples,
         "caching": True,
         "timeout": self.timeout,
+        # vLLM OpenAI server: ensure chat template is applied and an assistant turn is started
+        "extra_body": {
+            "use_chat_template": True
+        },
     }
 
     if "o1" in self.model:
@@ -381,6 +385,7 @@ def _patched___call_api(self, prompt, return_logits, max_new_tokens, num_samples
 
             content = msg.content
             reasoning = getattr(msg, "reasoning_content", None)
+            # Accept reasoning-only replies
             if (not content) and reasoning:
                 return response
 
@@ -388,14 +393,11 @@ def _patched___call_api(self, prompt, return_logits, max_new_tokens, num_samples
                 logger.info("Response is empty, retrying without caching")
                 kwargs["caching"] = False
                 response = litellm.completion(**kwargs)
-                msg = response.choices[0].message
-                content = msg.content
-                reasoning = getattr(msg, "reasoning_content", None)
             return response
         except litellm.BadRequestError as e:
             if "message" in e.__dict__ and "policy" in e.__dict__["message"]:
-                logger.warning(f"Content filtered. Returning empty response.")
+                logger.warning("Content filtered. 
Returning empty response.") return LitellmModelResponse() except Exception as e: wait_time = min(64, self.API_RETRY_SLEEP * (self.API_RETRY_MULTIPLIER**attempt)) @@ -405,10 +407,9 @@ def _patched___call_api(self, prompt, return_logits, max_new_tokens, num_samples logger.error(f"API call failed after {self.API_MAX_RETRY} attempts.") return LitellmModelResponse() -# APPLY PATCH: Must use mangled name because original was private (__call_api) +# APPLY PATCH LiteLLMClient._LiteLLMClient__call_api = _patched___call_api -# --- Patched greedy_until: merge reasoning + content, preserve ordering --- def _greedy_until_impl(self, docs: list[Doc]) -> list[ModelResponse]: dataset = GenerativeTaskDataset(requests=docs, num_dataset_splits=self.DATASET_SPLITS) results: list[ModelResponse] = [] @@ -420,7 +421,8 @@ def _greedy_until_impl(self, docs: list[Doc]) -> list[ModelResponse]: position=0, disable=self.disable_tqdm, ): - contexts = [self.prompt_manager.prepare_prompt_api(doc) for doc in dataset] + # FIX: only build contexts for the current split + contexts = [self.prompt_manager.prepare_prompt_api(doc) for doc in split] max_new_tokens = split[0].generation_size return_logits = split[0].use_logits @@ -430,7 +432,6 @@ def _greedy_until_impl(self, docs: list[Doc]) -> list[ModelResponse]: if num_samples > 1 and self.generation_parameters.temperature == 0: raise ValueError("num_samples > 1 requires temperature > 0") - # CRITICAL FIX: Access the private method via mangled name responses = self._LiteLLMClient__call_api_parallel( contexts, return_logits, @@ -443,28 +444,30 @@ def _greedy_until_impl(self, docs: list[Doc]) -> list[ModelResponse]: raw_contents = [(choice.message.content or "").strip() for choice in response.choices] raw_reasonings = [(getattr(choice.message, "reasoning_content", None) or "").strip() for choice in response.choices] - merged: list[str] = [] + merged_texts: list[str] = [] + reasonings: list[str | None] = [] + for c, r in zip(raw_contents, raw_reasonings): if c and r: - merged.append(r + "\n\n" + c) + merged_texts.append(f"{r}\n\n{c}") elif c: - merged.append(c) + merged_texts.append(c) elif r: - merged.append(r) + merged_texts.append(f"{r}") else: - merged.append("") - - reasonings: list[str | None] = [r if r != "" else None for r in raw_reasonings] - - if not merged or merged[0] is None: - merged = [""] - - cur_response = ModelResponse( - text=merged, - reasonings=reasonings, - input=context, + merged_texts.append("") + reasonings.append(r if r != "" else None) + + if not merged_texts or merged_texts[0] is None: + merged_texts = [""] + + results.append( + ModelResponse( + text=merged_texts, + reasonings=reasonings, + input=context, + ) ) - results.append(cur_response) if len(results) != len(dataset): raise RuntimeError(f"Internal mismatch: {len(results)} outputs vs {len(dataset)} docs.") From 3e30425b4c9c3711d4f835fee4b2056fa3f1f95b Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 25 Nov 2025 16:11:35 +0800 Subject: [PATCH 113/214] lightevel fix --- .github/workflows/eval-gms8k.yml | 1 + benchmarks/benchmark_lib.sh | 5 ++--- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 34fe89d3c..808ec37bc 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -42,6 +42,7 @@ on: paths: - '.github/workflows/eval-gms8k.yml' - '.github/workflows/eval-tmpl.yml' + - 'benchmarks/benchmark_lib.sh' jobs: eval: diff --git a/benchmarks/benchmark_lib.sh 
b/benchmarks/benchmark_lib.sh index 6b9ac8f65..020b03953 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -518,7 +518,7 @@ run_lighteval_eval() { local base_url="http://0.0.0.0:${port}/v1" export OPENAI_API_KEY="${OPENAI_API_KEY:-EMPTY}" - local MODEL_ARGS="model_name=${lite_model},base_url=${base_url},api_key=${OPENAI_API_KEY},generation_parameters={temperature:1.0}" + local MODEL_ARGS="model_name=${lite_model},base_url=${base_url},api_key=${OPENAI_API_KEY},generation_parameters={temperature:0.0,max_new_tokens:2056}" local TASK_SPEC="${task}|${num_fewshot}" set -x @@ -526,8 +526,7 @@ run_lighteval_eval() { "${MODEL_ARGS}" \ "${TASK_SPEC}" \ --output-dir "/workspace/${results_dir}" \ - --max-samples "${max_samples}" \ - --remove-reasoning-tags + --max-samples "${max_samples}" set +x } From 0d87ea5063dabb770496e0dea0692e229ef737cb Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 25 Nov 2025 16:49:17 +0800 Subject: [PATCH 114/214] lighteval test h100-cw_1 --- .github/workflows/eval-gms8k.yml | 2 +- .github/workflows/eval-tmpl.yml | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 808ec37bc..9d06c5623 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -49,7 +49,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: h200-nb_1 + runner: h100-cw_1 image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 63e568dd2..23bec7006 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -111,9 +111,7 @@ jobs: echo "[Docker] skipping docker cleanup on host $host" fi # Best-effort cleanup of prior eval outputs; do not block - sudo rm -rf /home/ubuntu/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out/openai__gpt-oss-120b || true - sudo rm -rf /home/ubuntu/actions-runner/_work/InferenceMAX/InferenceMAX/.litellm_cache/cache.db || true - + if command -v squeue >/dev/null 2>&1; then echo "[Slurm] Cleaning up resources ..." safe_timeout scancel -u "$USER" || true From 00b1623ee9ff1b9516d1243f041af18d2822109a Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 25 Nov 2025 16:54:18 +0800 Subject: [PATCH 115/214] lighteval test h100-cr_1 + parsing --- .github/workflows/eval-gms8k.yml | 2 +- utils/lm_eval_to_md.py | 247 ++++++++++++++++++++++++++----- 2 files changed, 208 insertions(+), 41 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 9d06c5623..0951f4254 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -49,7 +49,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: h100-cw_1 + runner: h100-cr_1 image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm diff --git a/utils/lm_eval_to_md.py b/utils/lm_eval_to_md.py index dbcc4d88d..0c59bc494 100644 --- a/utils/lm_eval_to_md.py +++ b/utils/lm_eval_to_md.py @@ -1,10 +1,10 @@ #!/usr/bin/env python3 """ -Convert latest lm-evaluation-harness JSON in a results dir into a Markdown table -for GitHub Actions job summary. Prints to stdout. +Convert latest lm-evaluation-harness and/or lighteval JSONs in a results dir +into Markdown tables for GitHub Actions job summary. Prints to stdout. 
-Usage:
-    python3 bench_serving/scripts/lm_eval_to_md.py \
+Usage (same as before, works even if FRAMEWORK/PRECISION env vars are empty):
+    python3 utils/lm_eval_to_md.py \
         --results-dir /workspace/eval_out \
         --task gsm8k \
         --framework vLLM \
@@ -13,25 +13,28 @@
     --ep 1 \
     --dp-attention false
 """
-import argparse, json, os, re, sys
+import argparse
+import json
+import os
+import re
+import sys
 from collections import Counter
 from glob import glob
+from typing import Optional, Tuple, Dict, Any, List
 
-def find_latest_json(results_dir: str):
-    paths = []
-    for root, _, _ in os.walk(results_dir):
-        paths.extend(glob(os.path.join(root, "*.json")))
-    if not paths:
-        return None
-    paths.sort(key=lambda p: os.path.getmtime(p), reverse=True)
-    return paths[0]
+
+# -----------------------
+# Helpers
+# -----------------------
 
 def pct(x):
     return f"{x*100:.2f}%" if isinstance(x, (int, float)) else "N/A"
 
+
 def se(x):
     return f" \u00B1{(x*100):.2f}%" if isinstance(x, (int, float)) else ""
 
+
 def gpu_cpu_from_pretty_env(pe: str):
     if not isinstance(pe, str) or not pe:
         return "Unknown GPU"
@@ -42,20 +45,74 @@ def gpu_cpu_from_pretty_env(pe: str):
     cpu_line = next((l.split(":", 1)[1].strip() for l in pe.splitlines() if l.startswith("Model name:")), None)
     return gpu_summary + (f" ({cpu_line})" if cpu_line else "")
 
-def extract_metrics(data: dict, task: str):
-    # results section can vary across harness versions
+
+def detect_framework_kind(data: Dict[str, Any]) -> str:
+    """
+    Classify JSON as:
+    - 'lm-eval'   : lm-evaluation-harness style JSON
+    - 'lighteval' : lighteval JSON
+    - 'unknown'   : anything else
+    """
+    # lm-eval has lm_eval_version + results structure like results["gsm8k"]...
+    if "lm_eval_version" in data or "pretty_env_info" in data:
+        return "lm-eval"
+    # lighteval has config_general + results keyed by "|"
+    if "config_general" in data and "results" in data:
+        return "lighteval"
+    return "unknown"
+
+
+def find_all_jsons(results_dir: str) -> List[str]:
+    paths = []
+    for root, _, _ in os.walk(results_dir):
+        for name in glob(os.path.join(root, "*.json")):
+            paths.append(name)
+    return paths
+
+
+def find_latest_by_kind(results_dir: str) -> Tuple[Optional[str], Optional[str]]:
+    """
+    Scan all JSONs under results_dir and return:
+        (latest_lm_eval_json_path, latest_lighteval_json_path)
+    """
+    lm_eval_candidates = []
+    lighteval_candidates = []
+
+    for path in find_all_jsons(results_dir):
+        try:
+            with open(path, "r") as f:
+                data = json.load(f)
+        except Exception:
+            continue
+
+        kind = detect_framework_kind(data)
+        mtime = os.path.getmtime(path)
+        if kind == "lm-eval":
+            lm_eval_candidates.append((mtime, path))
+        elif kind == "lighteval":
+            lighteval_candidates.append((mtime, path))
+
+    lm_path = max(lm_eval_candidates, default=(None, None))[1]
+    le_path = max(lighteval_candidates, default=(None, None))[1]
+    return lm_path, le_path
+
+
+# -----------------------
+# lm-eval parsing
+# -----------------------
+
+def extract_lm_eval_metrics(data: Dict[str, Any], task: str) -> Dict[str, Any]:
     res_all = data.get("results", {}) or {}
     res = res_all.get(task) if isinstance(res_all, dict) else {}
     if not res and isinstance(res_all, dict) and res_all:
-        # fallback to first key if requested task missing
        any_key = next(iter(res_all.keys()))
        res = res_all.get(any_key, {})
        task = any_key
    strict = res.get("exact_match,strict-match")
-    flex = res.get("exact_match,flexible-extract")
+    flex   = res.get("exact_match,flexible-extract")
     strict_se = res.get("exact_match_stderr,strict-match")
-    flex_se = res.get("exact_match_stderr,flexible-extract")
+    flex_se   = res.get("exact_match_stderr,flexible-extract")
 
     n_eff = None
     ns = data.get("n-samples") or data.get("n_samples") or {}
@@ -64,19 +121,16 @@ def extract_metrics(data: dict, task: str):
     if isinstance(tdict, dict):
         n_eff = tdict.get("effective") or tdict.get("n_eff")
 
-    # model/fewshot/limit are scattered depending on version
     model = data.get("model_name") \
         or data.get("configs", {}).get(task, {}).get("metadata", {}).get("model") \
         or data.get("config", {}).get("model") \
         or ""
 
-    # k-shot
     fewshot = None
     nshot = data.get("n-shot") or data.get("n_shot") or {}
     if isinstance(nshot, dict):
         fewshot = nshot.get(task) or nshot.get("gsm8k")
 
-    # limit
     limit = None
     cfg = data.get("config") or {}
     if isinstance(cfg, dict):
@@ -91,42 +145,155 @@ def extract_metrics(data: dict, task: str):
         "n_eff": n_eff,
         "model": model,
         "fewshot": fewshot,
-        "limit": limit
+        "limit": limit,
     }
+
+
+def render_lm_eval_section(path: str,
+                           args,
+                           framework_label: str,
+                           precision_label: str) -> Tuple[str, Dict[str, Any]]:
+    with open(path, "r") as f:
+        data = json.load(f)
+
+    hardware = gpu_cpu_from_pretty_env(data.get("pretty_env_info", ""))
+    m = extract_lm_eval_metrics(data, args.task)
+
+    print(f"### {args.task} Evaluation (lm-eval-harness)\n")
+    print("| Hardware | Framework | Precision | TP | EP | DP Attention | EM Strict | EM Flexible | N (eff) |")
+    print("|---|---|---:|--:|--:|:--:|--:|--:|--:|")
+    print(
+        f"| {hardware} | {framework_label} | {precision_label} | {args.tp} | {args.ep} | "
+        f"{str(args.dp_attention).lower()} | "
+        f"{pct(m['strict'])}{se(m['strict_se'])} | "
+        f"{pct(m['flex'])}{se(m['flex_se'])} | {m['n_eff'] or ''} |"
+    )
+
+    lim = m["limit"]
+    lim_str = str(int(lim)) if isinstance(lim, (int, float)) else (str(lim) if lim is not None else "")
+    fewshot = m["fewshot"] if m["fewshot"] is not None else ""
+    print(
+        f"\n_Model_: `{m['model']}`&nbsp;&nbsp;&nbsp;&nbsp;"
+        f"_k-shot_: **{fewshot}**&nbsp;&nbsp;&nbsp;&nbsp;"
+        f"_limit_: **{lim_str}**  \n"
+        f"_Source_: `{os.path.basename(path)}`"
+    )
+    return hardware, m
+
+
+# -----------------------
+# lighteval parsing
+# -----------------------
+
+def extract_lighteval_metrics(data: Dict[str, Any], task_base: str) -> Dict[str, Any]:
+    res_all = data.get("results", {}) or {}
+
+    # Prefer task-specific key like "gsm8k|5" over "all"
+    task_key = None
+    for k in res_all.keys():
+        if k.startswith(task_base):
+            task_key = k
+            break
+    if task_key is None and "all" in res_all:
+        task_key = "all"
+
+    r = res_all.get(task_key, {})
+    em = r.get("extractive_match")
+    em_se = r.get("extractive_match_stderr")
+
+    # Fewshot & other metadata from config_tasks if available
+    fewshot = None
+    cfg_tasks = data.get("config_tasks", {})
+    if isinstance(cfg_tasks, dict) and task_key in cfg_tasks:
+        fewshot = cfg_tasks[task_key].get("num_fewshots")
+
+    # Model name from config_general
+    cg = data.get("config_general", {}) or {}
+    model = cg.get("model_name") or cg.get("model_config", {}).get("model_name", "")
+
+    return {
+        "task": task_key or task_base,
+        "em": em,
+        "em_se": em_se,
+        "fewshot": fewshot,
+        "model": model,
+        # this lighteval JSON doesn't expose an obvious effective N; leave blank
+        "n_eff": None,
+    }
+
+
+def 
render_lighteval_section(path: str, + args, + framework_label: str, + precision_label: str, + hardware_fallback: Optional[str]) -> None: + with open(path, "r") as f: + data = json.load(f) + + m = extract_lighteval_metrics(data, args.task) + hardware = hardware_fallback or "Unknown GPU" + + print(f"### {args.task} Evaluation (lighteval)\n") + print("| Hardware | Framework | Precision | TP | EP | DP Attention | Extractive Match | N (eff) |") + print("|---|---|---:|--:|--:|:--:|--:|--:|") + print( + f"| {hardware} | {framework_label} | {precision_label} | {args.tp} | {args.ep} | " + f"{str(args.dp_attention).lower()} | " + f"{pct(m['em'])}{se(m['em_se'])} | {m['n_eff'] or ''} |" + ) + + fewshot = m["fewshot"] if m["fewshot"] is not None else "" + print( + f"\n_Model_: `{m['model']}`    " + f"_k-shot_: **{fewshot}** \n" + f"_Source_: `{os.path.basename(path)}`" + ) + + +# ----------------------- +# main +# ----------------------- + def main(): ap = argparse.ArgumentParser() ap.add_argument("--results-dir", required=True) ap.add_argument("--task", default="gsm8k") - ap.add_argument("--framework", default=os.environ.get("FRAMEWORK", "vLLM")) - ap.add_argument("--precision", default=os.environ.get("PRECISION", "fp16")) + ap.add_argument("--framework", default=os.environ.get("FRAMEWORK", "")) + ap.add_argument("--precision", default=os.environ.get("PRECISION", "")) ap.add_argument("--tp", default=os.environ.get("TP", "1")) ap.add_argument("--ep", default=os.environ.get("EP_SIZE", "1")) ap.add_argument("--dp-attention", default=os.environ.get("DP_ATTENTION", "false")) args = ap.parse_args() - path = find_latest_json(args.results_dir) - print(f"### {args.task} Evaluation\n") - if not path or not os.path.exists(path): + # Robust defaults if env vars / CLI args are empty + framework_label = args.framework or os.environ.get("FRAMEWORK") or "unknown" + precision_label = args.precision or os.environ.get("PRECISION") or "unknown" + + lm_path, le_path = find_latest_by_kind(args.results_dir) + + if not lm_path and not le_path: + print(f"### {args.task} Evaluation\n") print(f"> No result JSON found in `{args.results_dir}`.") return - with open(path, "r") as f: - data = json.load(f) + hardware_from_lm = None - hardware = gpu_cpu_from_pretty_env(data.get("pretty_env_info", "")) - m = extract_metrics(data, args.task) + # 1) lm-eval section (if present) + if lm_path: + hardware_from_lm, _ = render_lm_eval_section( + lm_path, args, framework_label, precision_label + ) - print("| Hardware | Framework | Precision | TP | EP | DP Attention | EM Strict | EM Flexible | N (eff) |") - print("|---|---|---:|--:|--:|:--:|--:|--:|--:|") - print(f"| {hardware} | {args.framework} | {args.precision} | {args.tp} | {args.ep} | {str(args.dp_attention).lower()} | " - f"{pct(m['strict'])}{se(m['strict_se'])} | {pct(m['flex'])}{se(m['flex_se'])} | {m['n_eff'] or ''} |") + # Spacer between sections if both exist + if lm_path and le_path: + print("\n") + + # 2) lighteval section (if present) + if le_path: + render_lighteval_section( + le_path, args, framework_label, precision_label, hardware_from_lm + ) - # metadata line - lim = m["limit"] - lim_str = str(int(lim)) if isinstance(lim, (int, float)) else (str(lim) if lim is not None else "") - fewshot = m["fewshot"] if m["fewshot"] is not None else "" - print(f"\n_Model_: `{m['model']}`    _k-shot_: **{fewshot}**    _limit_: **{lim_str}** \n_Source_: `{os.path.basename(path)}`") if __name__ == "__main__": try: From 83a71d22537ee05db9a72692b58f57170220b71b Mon Sep 17 00:00:00 2001 From: 
Oseltamivir Date: Tue, 25 Nov 2025 17:10:01 +0800 Subject: [PATCH 116/214] lighteval test b200_nvd --- .github/workflows/eval-gms8k.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 0951f4254..808ec37bc 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -49,7 +49,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: h100-cr_1 + runner: h200-nb_1 image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm From df71abeb987e6857e2b1aa06ce4e994203ea48b9 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 25 Nov 2025 17:30:39 +0800 Subject: [PATCH 117/214] lighteval test b200_nvd --- .github/workflows/eval-gms8k.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 808ec37bc..cbcbe9423 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -49,7 +49,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: h200-nb_1 + runner: b200_nvd-0 image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm From 4aa8d3446b5f3f5f06755bbd3b1765044f54ffc7 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 25 Nov 2025 17:37:27 +0800 Subject: [PATCH 118/214] lighteval test mi300x-amd_0 --- .github/workflows/eval-gms8k.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index cbcbe9423..233f552f4 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -49,8 +49,8 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: b200_nvd-0 - image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} + runner: mi300x-amd_0 + image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm precision: fp4 From fe2ecd5eaeb25a417e8d65662fe2440819bd0522 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 25 Nov 2025 17:40:55 +0800 Subject: [PATCH 119/214] lighteval test h100-cw_1 --- .github/workflows/eval-gms8k.yml | 4 ++-- benchmarks/benchmark_lib.sh | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 233f552f4..9d06c5623 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -49,8 +49,8 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi300x-amd_0 - image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} + runner: h100-cw_1 + image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm precision: fp4 diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 020b03953..b0618c59a 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -474,8 +474,10 @@ def _greedy_until_impl(self, docs: list[Doc]) -> list[ModelResponse]: return dataset.get_original_order(results) -# Re-apply caching decorator -LiteLLMClient.greedy_until = cached(SamplingMethod.GENERATIVE)(_greedy_until_impl) +# Disable lighteval on-disk caching to avoid filesystem issues with task 
names +# like "gsm8k|5" becoming part of cache paths on certain filesystems. +# We directly bind the greedy method without the caching decorator. +LiteLLMClient.greedy_until = _greedy_until_impl PY export PYTHONPATH="${patch_dir}:${PYTHONPATH:-}" } @@ -553,4 +555,4 @@ run_eval() { *) echo "Unknown framework '${framework}'"; return 1 ;; esac -} \ No newline at end of file +} From fef016a51779ff50a3c5de2793dd660d9747823f Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 25 Nov 2025 17:42:27 +0800 Subject: [PATCH 120/214] lighteval test mi300x-cr_0 --- .github/workflows/eval-gms8k.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 9d06c5623..82a588bf6 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -49,8 +49,8 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: h100-cw_1 - image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} + runner: mi300x-cr_0 + image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm precision: fp4 From 124eb70ae63cb1c48ba55ac007ecfa090c8ba007 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 25 Nov 2025 17:58:24 +0800 Subject: [PATCH 121/214] lighteval test mi325x-tw_1 --- .github/workflows/eval-gms8k.yml | 2 +- benchmarks/gptoss_fp4_b200_docker.sh | 2 +- benchmarks/gptoss_fp4_h100_docker.sh | 2 +- benchmarks/gptoss_fp4_h100_slurm.sh | 2 +- benchmarks/gptoss_fp4_h200_slurm.sh | 2 +- benchmarks/gptoss_fp4_mi300x_docker.sh | 2 +- benchmarks/gptoss_fp4_mi325x_slurm.sh | 2 +- benchmarks/gptoss_fp4_mi355x_docker.sh | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 82a588bf6..46d676b34 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -49,7 +49,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi300x-cr_0 + runner: mi325x-tw_1 image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh index 1f4679383..f33e23bd6 100644 --- a/benchmarks/gptoss_fp4_b200_docker.sh +++ b/benchmarks/gptoss_fp4_b200_docker.sh @@ -80,7 +80,7 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_eval --framework lm-eval --port "$PORT" +#run_eval --framework lm-eval --port "$PORT" run_eval --framework lighteval --task gsm8k --num-fewshot 5 append_lm_eval_summary set +x \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index e9c1cfc5a..463f31b90 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -61,7 +61,7 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_eval --framework lm-eval --port "$PORT" +#run_eval --framework lm-eval --port "$PORT" run_eval --framework lighteval --task gsm8k --num-fewshot 5 append_lm_eval_summary set +x \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh index 394d68bc1..19ad98294 100644 --- a/benchmarks/gptoss_fp4_h100_slurm.sh +++ 
b/benchmarks/gptoss_fp4_h100_slurm.sh @@ -62,7 +62,7 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_eval --framework lm-eval --port "$PORT" +#run_eval --framework lm-eval --port "$PORT" run_eval --framework lighteval --task gsm8k --num-fewshot 5 append_lm_eval_summary set +x diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh index 9906b2fa5..e5a3a6961 100644 --- a/benchmarks/gptoss_fp4_h200_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_slurm.sh @@ -75,7 +75,7 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_eval --framework lm-eval --port "$PORT" +#run_eval --framework lm-eval --port "$PORT" run_eval --framework lighteval --task gsm8k --num-fewshot 5 append_lm_eval_summary set +x \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh index 777aa2c2d..0dade438d 100644 --- a/benchmarks/gptoss_fp4_mi300x_docker.sh +++ b/benchmarks/gptoss_fp4_mi300x_docker.sh @@ -62,7 +62,7 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_eval --framework lm-eval --port "$PORT" +#run_eval --framework lm-eval --port "$PORT" run_eval --framework lighteval --task gsm8k --num-fewshot 5 append_lm_eval_summary set +x \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh index 3394bcc04..b3bdfbec7 100644 --- a/benchmarks/gptoss_fp4_mi325x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh @@ -70,7 +70,7 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_eval --framework lm-eval --port "$PORT" +#run_eval --framework lm-eval --port "$PORT" run_eval --framework lighteval --task gsm8k --num-fewshot 5 append_lm_eval_summary set +x \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh index a413acd69..305210cf7 100644 --- a/benchmarks/gptoss_fp4_mi355x_docker.sh +++ b/benchmarks/gptoss_fp4_mi355x_docker.sh @@ -60,7 +60,7 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_eval --framework lm-eval --port "$PORT" +#run_eval --framework lm-eval --port "$PORT" run_eval --framework lighteval --task gsm8k --num-fewshot 5 append_lm_eval_summary set +x \ No newline at end of file From 2b0b9860e29fc700acbb7e1489199afe9307c726 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 25 Nov 2025 18:33:56 +0800 Subject: [PATCH 122/214] lighteval test mi355x-amd_4 --- .github/workflows/eval-gms8k.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 46d676b34..af0baa997 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -49,7 +49,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi325x-tw_1 + runner: mi355x-amd_5 image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm From dae73454a492bd2ac98e6026aa624b15fff975ec Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 25 Nov 2025 18:37:14 +0800 Subject: [PATCH 123/214] lighteval test b200-nvd_3 --- .github/workflows/eval-gms8k.yml | 4 ++-- .github/workflows/eval-tmpl.yml | 1 - 2 files changed, 
2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index af0baa997..308212ca6 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -49,8 +49,8 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi355x-amd_5 - image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} + runner: b200-nvd_3 + image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm precision: fp4 diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 23bec7006..8b6537986 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -162,5 +162,4 @@ jobs: sudo rm -rf \ "${{ github.workspace }}/.litellm_cache" \ "${{ github.workspace }}/eval_out"* \ - "${{ github.workspace }}/.cache" \ || true \ No newline at end of file From 993b19f9d1e667200fdda2da874dc429150066c6 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 25 Nov 2025 19:55:54 +0800 Subject: [PATCH 124/214] lighteval test h100-cw_1 sudo test --- .github/workflows/eval-gms8k.yml | 2 +- .github/workflows/eval-tmpl.yml | 14 ++++++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 308212ca6..9d06c5623 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -49,7 +49,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: b200-nvd_3 + runner: h100-cw_1 image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 8b6537986..62e37d553 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -79,6 +79,8 @@ jobs: steps: - name: Resource cleanup run: | + sudo mv /home/nvadmin/actions-runner/_work/InferenceMAX/InferenceMAX/.litellm_cache /tmp/ || true + sudo -n rm -rf /home/nvadmin/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out* # Helper to avoid indefinite hangs on flaky tools (Docker/Slurm) safe_timeout() { if command -v timeout >/dev/null 2>&1; then @@ -159,7 +161,11 @@ jobs: - name: Resource cleanup run: | - sudo rm -rf \ - "${{ github.workspace }}/.litellm_cache" \ - "${{ github.workspace }}/eval_out"* \ - || true \ No newline at end of file + ls -lt + pkill -f litellm || true + sleep 2 + if command -v fuser >/dev/null 2>&1; then + fuser -k /home/nvadmin/actions-runner/_work/InferenceMAX/InferenceMAX/.litellm_cache/cache.db 2>/dev/null || true + fi + sudo rm -rf .litellm_cache + sudo rm -rf eval_out* \ No newline at end of file From f5b3a7af372868b66619af8dacda9e0e05f2473a Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 25 Nov 2025 22:59:59 +0800 Subject: [PATCH 125/214] b200 fix check --- .github/workflows/eval-gms8k.yml | 2 +- .github/workflows/eval-tmpl.yml | 5 ++--- benchmarks/benchmark_lib.sh | 4 ++++ benchmarks/gptoss_fp4_h200_slurm.sh | 2 +- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 9d06c5623..808ec37bc 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -49,7 +49,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: h100-cw_1 + runner: h200-nb_1 image: ${{ inputs.image || 
'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 62e37d553..8db652350 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -79,8 +79,6 @@ jobs: steps: - name: Resource cleanup run: | - sudo mv /home/nvadmin/actions-runner/_work/InferenceMAX/InferenceMAX/.litellm_cache /tmp/ || true - sudo -n rm -rf /home/nvadmin/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out* # Helper to avoid indefinite hangs on flaky tools (Docker/Slurm) safe_timeout() { if command -v timeout >/dev/null 2>&1; then @@ -161,7 +159,8 @@ jobs: - name: Resource cleanup run: | - ls -lt + ls -lt eval_out + ls -lt .litellm_cache pkill -f litellm || true sleep 2 if command -v fuser >/dev/null 2>&1; then diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index b0618c59a..b5750495e 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -100,6 +100,7 @@ run_benchmark_serving() { fi set -x + echo "Before benchmark_serving: $(id -u) $(id -g) $(id -un)" >&2 python3 "$BENCH_SERVING_DIR/benchmark_serving.py" \ --model "$model" \ --backend "$backend" \ @@ -269,6 +270,7 @@ run_lm_eval() { export OPENAI_API_KEY=${OPENAI_API_KEY:-EMPTY} set -x + echo "Before lm_eval: $(id -u) $(id -g) $(id -un)" >&2 python3 -m lm_eval --model local-chat-completions --apply_chat_template \ --tasks "${task}" \ --num_fewshot "${num_fewshot}" \ @@ -524,6 +526,7 @@ run_lighteval_eval() { local TASK_SPEC="${task}|${num_fewshot}" set -x + echo "Before lighteval: $(id -u) $(id -g) $(id -un)" >&2 lighteval endpoint litellm \ "${MODEL_ARGS}" \ "${TASK_SPEC}" \ @@ -555,4 +558,5 @@ run_eval() { *) echo "Unknown framework '${framework}'"; return 1 ;; esac + ls -ld /workspace /workspace/eval_out* /workspace/results* } diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh index e5a3a6961..9906b2fa5 100644 --- a/benchmarks/gptoss_fp4_h200_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_slurm.sh @@ -75,7 +75,7 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -#run_eval --framework lm-eval --port "$PORT" +run_eval --framework lm-eval --port "$PORT" run_eval --framework lighteval --task gsm8k --num-fewshot 5 append_lm_eval_summary set +x \ No newline at end of file From ff1eba60c1d7e2dfecb08ba02ebfb81afbf108f2 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 25 Nov 2025 23:00:23 +0800 Subject: [PATCH 126/214] b200 fix check --- .github/workflows/eval-gms8k.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 808ec37bc..308212ca6 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -49,7 +49,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: h200-nb_1 + runner: b200-nvd_3 image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm From d6a52ec14e97dd840a594678144133933334452d Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 25 Nov 2025 23:01:28 +0800 Subject: [PATCH 127/214] b200 fix check --- .github/workflows/eval-tmpl.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 8db652350..c14e8e509 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml 
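# [Editorial aside, not part of the recorded patch: the hunk below cleans up
# with paths hard-coded to a single runner's home directory. A guarded,
# runner-agnostic sketch of the same cleanup, assuming the standard
# GITHUB_WORKSPACE variable — the :? expansion aborts the step instead of
# expanding to an empty path if the variable is unset:
#
#   sudo rm -rf "${GITHUB_WORKSPACE:?}/.litellm_cache" "${GITHUB_WORKSPACE:?}"/eval_out* || true
#
# Patches 128-133 below iterate through several variants of this cleanup.]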
@@ -87,6 +87,8 @@ jobs: "$@" fi } + sudo rm -rf /home/nvadmin/actions-runner/_work/InferenceMAX/InferenceMAX/.litellm_cache/ + sudo rm -rf /home/nvadmin/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out* host=$(hostname) if [[ "$host" == "b200-81" || "$host" == "b200-80" || "$host" == "b200-79" ]]; then From 4dd7e2162d6366b3bfb7e0483a7f06f10d50c253 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 25 Nov 2025 23:05:56 +0800 Subject: [PATCH 128/214] b200 fix check --- .github/workflows/eval-tmpl.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index c14e8e509..94f8ec395 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -87,9 +87,9 @@ jobs: "$@" fi } - sudo rm -rf /home/nvadmin/actions-runner/_work/InferenceMAX/InferenceMAX/.litellm_cache/ - sudo rm -rf /home/nvadmin/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out* - + set -x + sudo rm -rf $GITHUB_WORKSPACE + set +x host=$(hostname) if [[ "$host" == "b200-81" || "$host" == "b200-80" || "$host" == "b200-79" ]]; then if command -v docker >/dev/null 2>&1; then From 37bd3df1c9d848bb3858e6a20b94d1f07288140d Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 25 Nov 2025 23:13:36 +0800 Subject: [PATCH 129/214] b200 fix check --- .github/workflows/eval-tmpl.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 94f8ec395..c6b538548 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -88,7 +88,7 @@ jobs: fi } set -x - sudo rm -rf $GITHUB_WORKSPACE + sudo rm -rf ${GITHUB_WORKSPACE}/* || true set +x host=$(hostname) if [[ "$host" == "b200-81" || "$host" == "b200-80" || "$host" == "b200-79" ]]; then From 43c7c595c94fdd14de3cbdf1869a35513b918abb Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 25 Nov 2025 23:15:21 +0800 Subject: [PATCH 130/214] b200 fix check --- .github/workflows/eval-tmpl.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index c6b538548..1491d74d3 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -88,7 +88,7 @@ jobs: fi } set -x - sudo rm -rf ${GITHUB_WORKSPACE}/* || true + sudo rm -rfv ${GITHUB_WORKSPACE}/* || true set +x host=$(hostname) if [[ "$host" == "b200-81" || "$host" == "b200-80" || "$host" == "b200-79" ]]; then From e5a8e3ae30e849e8632e57e75655736bdb2b8bed Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 25 Nov 2025 23:22:38 +0800 Subject: [PATCH 131/214] b200 fix check --- .github/workflows/eval-tmpl.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 1491d74d3..8501af2ea 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -87,9 +87,6 @@ jobs: "$@" fi } - set -x - sudo rm -rfv ${GITHUB_WORKSPACE}/* || true - set +x host=$(hostname) if [[ "$host" == "b200-81" || "$host" == "b200-80" || "$host" == "b200-79" ]]; then if command -v docker >/dev/null 2>&1; then @@ -133,6 +130,9 @@ jobs: echo "[Slurm] Jobs still present after timeout; proceeding" fi fi + set -x + sudo rm -rfv ${GITHUB_WORKSPACE}/* || true + set +x - uses: actions/checkout@v5 with: From 8fb95f4ce84aa906f680eefd5883a29260ab045e Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 25 Nov 2025 23:29:02 +0800 Subject: [PATCH 132/214] b200 fix 
check

---
 .github/workflows/eval-gms8k.yml | 2 +-
 .github/workflows/eval-tmpl.yml | 3 ---
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml
index 308212ca6..2101ba972 100644
--- a/.github/workflows/eval-gms8k.yml
+++ b/.github/workflows/eval-gms8k.yml
@@ -49,7 +49,7 @@ jobs:
     uses: ./.github/workflows/eval-tmpl.yml
     secrets: inherit
     with:
-      runner: b200-nvd_3
+      runner: b200-nvd_1
       image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }}
       model: ${{ inputs.model || 'openai/gpt-oss-120b' }}
       framework: vllm
diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml
index 8501af2ea..4fb1f159c 100644
--- a/.github/workflows/eval-tmpl.yml
+++ b/.github/workflows/eval-tmpl.yml
@@ -130,9 +130,6 @@ jobs:
             echo "[Slurm] Jobs still present after timeout; proceeding"
           fi
         fi
-        set -x
-        sudo rm -rfv ${GITHUB_WORKSPACE}/* || true
-        set +x

     - uses: actions/checkout@v5
       with:

From 237b4e8e9ce03a8cb492ccc1aeba94a87fc5a410 Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Tue, 25 Nov 2025 23:29:54 +0800
Subject: [PATCH 133/214] b200 fix check

---
 .github/workflows/eval-gms8k.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml
index 2101ba972..3456c422d 100644
--- a/.github/workflows/eval-gms8k.yml
+++ b/.github/workflows/eval-gms8k.yml
@@ -49,7 +49,7 @@ jobs:
     uses: ./.github/workflows/eval-tmpl.yml
     secrets: inherit
     with:
-      runner: b200-nvd_1
+      runner: b200-nvd_2
       image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }}
       model: ${{ inputs.model || 'openai/gpt-oss-120b' }}
       framework: vllm

From 79eadc5d958a72898885ffed76530a0d885f9127 Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Wed, 26 Nov 2025 19:45:24 +0800
Subject: [PATCH 134/214] Preliminary lighteval for all

---
 .github/workflows/eval-gms8k.yml | 12 +--
 benchmarks/benchmark_lib.sh | 135 +++++++++++++++++-------
 benchmarks/dsr1_fp4_b200_docker.sh | 4 +-
 benchmarks/dsr1_fp4_mi355x_docker.sh | 5 +-
 benchmarks/dsr1_fp8_b200_docker.sh | 7 +-
 benchmarks/dsr1_fp8_h200_slurm.sh | 4 +-
 benchmarks/dsr1_fp8_h200_trt_slurm.sh | 5 +-
 benchmarks/dsr1_fp8_mi300x_docker.sh | 7 +-
 benchmarks/dsr1_fp8_mi300x_slurm.sh | 5 +
 benchmarks/dsr1_fp8_mi325x_docker.sh | 4 +
 benchmarks/dsr1_fp8_mi325x_slurm.sh | 7 +-
 benchmarks/dsr1_fp8_mi355x_docker.sh | 4 +-
 benchmarks/dsr1_fp8_mi355x_slurm.sh | 4 +-
 benchmarks/gptoss_fp4_h200_slurm.sh | 2 +-
 benchmarks/gptoss_fp4_h200_trt_slurm.sh | 2 +-
 benchmarks/gptoss_fp4_mi300x_slurm.sh | 2 +-
 benchmarks/gptoss_fp4_mi325x_docker.sh | 2 +-
 benchmarks/gptoss_fp4_mi355x_slurm.sh | 2 +-
 18 files changed, 151 insertions(+), 62 deletions(-)

diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml
index 3456c422d..d1b33a6c1 100644
--- a/.github/workflows/eval-gms8k.yml
+++ b/.github/workflows/eval-gms8k.yml
@@ -49,12 +49,12 @@ jobs:
     uses: ./.github/workflows/eval-tmpl.yml
     secrets: inherit
     with:
-      runner: b200-nvd_2
-      image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }}
-      model: ${{ inputs.model || 'openai/gpt-oss-120b' }}
-      framework: vllm
-      precision: fp4
-      exp-name: ${{ inputs.exp-name || 'gptoss_gsm8k_poc' }}
+      runner: mi325x-tw_1
+      image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' }}
+      model: ${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }}
+      framework: sglang
+      precision: fp8
+      exp-name: ${{ inputs.exp-name || 'dsr1_gsm8k_poc' }}
       tp: '2'
       ep: '1'
       dp-attn: false
       port: ${{ inputs.port || '8888' }}
diff --git
a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index b5750495e..a7ab5c5c9 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -100,7 +100,6 @@ run_benchmark_serving() { fi set -x - echo "Before benchmark_serving: $(id -u) $(id -g) $(id -un)" >&2 python3 "$BENCH_SERVING_DIR/benchmark_serving.py" \ --model "$model" \ --backend "$backend" \ @@ -270,7 +269,6 @@ run_lm_eval() { export OPENAI_API_KEY=${OPENAI_API_KEY:-EMPTY} set -x - echo "Before lm_eval: $(id -u) $(id -g) $(id -un)" >&2 python3 -m lm_eval --model local-chat-completions --apply_chat_template \ --tasks "${task}" \ --num_fewshot "${num_fewshot}" \ @@ -329,27 +327,46 @@ _patch_lighteval_litellm() { patch_dir="$(mktemp -d)" cat > "$patch_dir/sitecustomize.py" <<'PY' import logging +import os import time +import re +from concurrent.futures import ThreadPoolExecutor, as_completed import litellm from tqdm import tqdm litellm.suppress_debug_info = True +litellm.drop_params = True + +# Remove sglang import that crashes +try: + # This is where lighteval's is_package_available lives + from lighteval.utils import imports as le_imports +except Exception: + le_imports = None +else: + _orig_is_package_available = le_imports.is_package_available + + def _patched_is_package_available(pkg: str) -> bool: + # Force "sglang" to look unavailable so that + # lighteval.models.sglang.sglang_model never imports `sglang` + if pkg == "sglang": + return False + return _orig_is_package_available(pkg) + + le_imports.is_package_available = _patched_is_package_available from lighteval.models.endpoints.litellm_model import LiteLLMClient from lighteval.data import GenerativeTaskDataset -from lighteval.tasks.requests import Doc, SamplingMethod +from lighteval.tasks.requests import Doc from lighteval.models.model_output import ModelResponse -from lighteval.utils.cache_management import cached logger = logging.getLogger(__name__) -# --- Patched __call_api: don't retry when we have reasoning_content, enforce chat template on vLLM and avoid stop interference --- -def _patched___call_api(self, prompt, return_logits, max_new_tokens, num_samples, stop_sequence): # noqa: C901 +def _patched___call_api(self, prompt, return_logits, max_new_tokens, num_samples, stop_sequence): # noqa: C901, N802 from lighteval.models.endpoints.litellm_model import LitellmModelResponse response = LitellmModelResponse() - - stop_sequence = None # Important: let the chat template drive turn-taking + # Keep dataset-provided stop sequences to cut early max_new_tokens = self._prepare_max_new_tokens(max_new_tokens) if return_logits and not self.provider == "openai": @@ -360,18 +377,22 @@ def _patched___call_api(self, prompt, return_logits, max_new_tokens, num_samples "messages": prompt, "max_tokens": max_new_tokens, "logprobs": return_logits if self.provider == "openai" else None, - "stop": stop_sequence, # disabled for chat + "stop": stop_sequence, "base_url": self.base_url, "api_key": self.api_key, "n": num_samples, - "caching": True, "timeout": self.timeout, - # vLLM OpenAI server: ensure chat template is applied and an assistant turn is started - "extra_body": { - "use_chat_template": True - }, } + # vLLM/SGLang OpenAI servers: apply chat template and start assistant turn + if ( + self.provider == "openai" + and isinstance(self.base_url, str) + and self.base_url + and ("api.openai.com" not in self.base_url) + ): + kwargs["extra_body"] = {"use_chat_template": True, "add_generation_prompt": True} + if "o1" in self.model: logger.warning("O1 models do not 
support temperature, top_p, stop sequence. Disabling.") else: @@ -384,15 +405,15 @@ def _patched___call_api(self, prompt, return_logits, max_new_tokens, num_samples try: response = litellm.completion(**kwargs) msg = response.choices[0].message - content = msg.content + content = getattr(msg, "content", None) reasoning = getattr(msg, "reasoning_content", None) # Accept reasoning-only replies if (not content) and reasoning: return response - if not content: - logger.info("Response is empty, retrying without caching") + if not content and LITELLM_CACHE: + logger.info("Empty content with caching on; retrying uncached once") kwargs["caching"] = False response = litellm.completion(**kwargs) @@ -409,8 +430,49 @@ def _patched___call_api(self, prompt, return_logits, max_new_tokens, num_samples logger.error(f"API call failed after {self.API_MAX_RETRY} attempts.") return LitellmModelResponse() -# APPLY PATCH -LiteLLMClient._LiteLLMClient__call_api = _patched___call_api + +def _patched___call_api_parallel(self, prompts, return_logits, max_new_tokens, num_samples, stop_sequence): # noqa: N802 + # Build per-item args + return_logitss = [return_logits for _ in prompts] if not isinstance(return_logits, list) else return_logits + max_new_tokenss = [max_new_tokens for _ in prompts] if not isinstance(max_new_tokens, list) else max_new_tokens + num_sampless = [num_samples for _ in prompts] if not isinstance(num_samples, list) else num_samples + stop_sequencess = [stop_sequence for _ in prompts] + + n = len(prompts) + assert n == len(return_logitss) == len(max_new_tokenss) == len(num_sampless) == len(stop_sequencess), ( + f"Length mismatch: {len(prompts)}, {len(return_logitss)}, {len(max_new_tokenss)}, " + f"{len(num_sampless)}, {len(stop_sequencess)}" + ) + + results = [None] * n + with ThreadPoolExecutor(self.concurrent_requests) as executor: + futures = [] + for idx in range(n): + fut = executor.submit( + self._LiteLLMClient__call_api, + prompts[idx], + return_logitss[idx], + max_new_tokenss[idx], + num_sampless[idx], + stop_sequencess[idx], + ) + fut._le_idx = idx # attach index for order restoration + futures.append(fut) + + for fut in tqdm(as_completed(futures), total=n, disable=self.disable_tqdm): + idx = getattr(fut, "_le_idx", None) + try: + res = fut.result() + except Exception: + res = None + if idx is not None: + results[idx] = res + + if any(r is None for r in results): + raise ValueError("Some entries are not annotated due to errors in __call_api_parallel, please inspect and retry.") + + return results + def _greedy_until_impl(self, docs: list[Doc]) -> list[ModelResponse]: dataset = GenerativeTaskDataset(requests=docs, num_dataset_splits=self.DATASET_SPLITS) @@ -423,7 +485,6 @@ def _greedy_until_impl(self, docs: list[Doc]) -> list[ModelResponse]: position=0, disable=self.disable_tqdm, ): - # FIX: only build contexts for the current split contexts = [self.prompt_manager.prepare_prompt_api(doc) for doc in split] max_new_tokens = split[0].generation_size @@ -443,22 +504,18 @@ def _greedy_until_impl(self, docs: list[Doc]) -> list[ModelResponse]: ) for response, context in zip(responses, contexts): - raw_contents = [(choice.message.content or "").strip() for choice in response.choices] - raw_reasonings = [(getattr(choice.message, "reasoning_content", None) or "").strip() for choice in response.choices] - merged_texts: list[str] = [] reasonings: list[str | None] = [] - for c, r in zip(raw_contents, raw_reasonings): - if c and r: - merged_texts.append(f"{r}\n\n{c}") - elif c: - merged_texts.append(c) - 
elif r: - merged_texts.append(f"{r}") - else: - merged_texts.append("") - reasonings.append(r if r != "" else None) + for choice in response.choices: + msg = choice.message + raw_content = getattr(msg, "content", None) or "" + reasoning = getattr(msg, "reasoning_content", None) + + # For answer extraction, use only the content field + # The reasoning is stored separately for logging/debugging + merged_texts.append(raw_content.strip() if raw_content else "") + reasonings.append(reasoning if reasoning else None) if not merged_texts or merged_texts[0] is None: merged_texts = [""] @@ -476,10 +533,10 @@ def _greedy_until_impl(self, docs: list[Doc]) -> list[ModelResponse]: return dataset.get_original_order(results) -# Disable lighteval on-disk caching to avoid filesystem issues with task names -# like "gsm8k|5" becoming part of cache paths on certain filesystems. -# We directly bind the greedy method without the caching decorator. -LiteLLMClient.greedy_until = _greedy_until_impl +# Bind patches +LiteLLMClient._LiteLLMClient__call_api = _patched___call_api +LiteLLMClient._LiteLLMClient__call_api_parallel = _patched___call_api_parallel +#LiteLLMClient.greedy_until = _greedy_until_impl PY export PYTHONPATH="${patch_dir}:${PYTHONPATH:-}" } @@ -522,11 +579,11 @@ run_lighteval_eval() { local base_url="http://0.0.0.0:${port}/v1" export OPENAI_API_KEY="${OPENAI_API_KEY:-EMPTY}" - local MODEL_ARGS="model_name=${lite_model},base_url=${base_url},api_key=${OPENAI_API_KEY},generation_parameters={temperature:0.0,max_new_tokens:2056}" + + local MODEL_ARGS="model_name=${lite_model},base_url=${base_url},api_key=${OPENAI_API_KEY},generation_parameters={temperature:0.0,max_new_tokens:2048},concurrent_requests=8" local TASK_SPEC="${task}|${num_fewshot}" set -x - echo "Before lighteval: $(id -u) $(id -g) $(id -un)" >&2 lighteval endpoint litellm \ "${MODEL_ARGS}" \ "${TASK_SPEC}" \ diff --git a/benchmarks/dsr1_fp4_b200_docker.sh b/benchmarks/dsr1_fp4_b200_docker.sh index 656085fef..a2d0fb081 100644 --- a/benchmarks/dsr1_fp4_b200_docker.sh +++ b/benchmarks/dsr1_fp4_b200_docker.sh @@ -49,5 +49,7 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_lm_eval --port "$PORT" +#run_lm_eval --port "$PORT" +MODEL_NAME="openai/$MODEL" +run_eval --framework lighteval --task gsm8k --num-fewshot 5 append_lm_eval_summary \ No newline at end of file diff --git a/benchmarks/dsr1_fp4_mi355x_docker.sh b/benchmarks/dsr1_fp4_mi355x_docker.sh index f19b6df2e..b3f2a0e96 100644 --- a/benchmarks/dsr1_fp4_mi355x_docker.sh +++ b/benchmarks/dsr1_fp4_mi355x_docker.sh @@ -51,4 +51,7 @@ run_benchmark_serving \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ - +MODEL_NAME="openai/$MODEL" +run_eval --framework lighteval --task gsm8k --num-fewshot 5 +append_lm_eval_summary +set +x diff --git a/benchmarks/dsr1_fp8_b200_docker.sh b/benchmarks/dsr1_fp8_b200_docker.sh index e68397661..afb5c1655 100644 --- a/benchmarks/dsr1_fp8_b200_docker.sh +++ b/benchmarks/dsr1_fp8_b200_docker.sh @@ -60,5 +60,8 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_lm_eval --port "$PORT" -append_lm_eval_summary \ No newline at end of file +#run_lm_eval --port "$PORT" +MODEL_NAME="openai/$MODEL" +run_eval --framework lighteval --task gsm8k --num-fewshot 5 +append_lm_eval_summary +set +x \ No newline at end of file diff --git a/benchmarks/dsr1_fp8_h200_slurm.sh b/benchmarks/dsr1_fp8_h200_slurm.sh index 8e8ec7469..4c75cc17e 100644 --- 
a/benchmarks/dsr1_fp8_h200_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_slurm.sh @@ -65,5 +65,7 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_lm_eval --port "$PORT" +#run_lm_eval --port "$PORT" +MODEL_NAME="openai/$MODEL" +run_eval --framework lighteval --task gsm8k --num-fewshot 5 append_lm_eval_summary \ No newline at end of file diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index c829e66b5..797c2b67c 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -91,5 +91,8 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_lm_eval --port "$PORT" +#run_lm_eval --port "$PORT" +MODEL_NAME="openai/$MODEL" +run_eval --framework lighteval --task gsm8k --num-fewshot 5 append_lm_eval_summary +set +x diff --git a/benchmarks/dsr1_fp8_mi300x_docker.sh b/benchmarks/dsr1_fp8_mi300x_docker.sh index 3e604f3ca..cec263322 100644 --- a/benchmarks/dsr1_fp8_mi300x_docker.sh +++ b/benchmarks/dsr1_fp8_mi300x_docker.sh @@ -58,5 +58,8 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_lm_eval --port "$PORT" -append_lm_eval_summary \ No newline at end of file +#run_lm_eval --port "$PORT" +MODEL_NAME="openai/$MODEL" +run_eval --framework lighteval --task gsm8k --num-fewshot 5 +append_lm_eval_summary +set +x \ No newline at end of file diff --git a/benchmarks/dsr1_fp8_mi300x_slurm.sh b/benchmarks/dsr1_fp8_mi300x_slurm.sh index 5fad7a587..f302c942f 100644 --- a/benchmarks/dsr1_fp8_mi300x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi300x_slurm.sh @@ -66,3 +66,8 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +MODEL_NAME="openai/$MODEL" +run_eval --framework lighteval --task gsm8k --num-fewshot 5 +append_lm_eval_summary +set +x \ No newline at end of file diff --git a/benchmarks/dsr1_fp8_mi325x_docker.sh b/benchmarks/dsr1_fp8_mi325x_docker.sh index 565b8fb45..ffe79da5c 100644 --- a/benchmarks/dsr1_fp8_mi325x_docker.sh +++ b/benchmarks/dsr1_fp8_mi325x_docker.sh @@ -48,3 +48,7 @@ run_benchmark_serving \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ +MODEL_NAME="openai/$MODEL" +run_eval --framework lighteval --task gsm8k --num-fewshot 5 +append_lm_eval_summary +set +x \ No newline at end of file diff --git a/benchmarks/dsr1_fp8_mi325x_slurm.sh b/benchmarks/dsr1_fp8_mi325x_slurm.sh index 4e66a64fb..4cfd8fdad 100644 --- a/benchmarks/dsr1_fp8_mi325x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi325x_slurm.sh @@ -44,5 +44,8 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_lm_eval --port "$PORT" -append_lm_eval_summary \ No newline at end of file +#run_lm_eval --port "$PORT" +MODEL_NAME="openai/$MODEL" +run_eval --framework lighteval --task gsm8k --num-fewshot 5 +append_lm_eval_summary +set +x \ No newline at end of file diff --git a/benchmarks/dsr1_fp8_mi355x_docker.sh b/benchmarks/dsr1_fp8_mi355x_docker.sh index 17e51344a..a5a6eee2a 100644 --- a/benchmarks/dsr1_fp8_mi355x_docker.sh +++ b/benchmarks/dsr1_fp8_mi355x_docker.sh @@ -59,6 +59,8 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_lm_eval --port "$PORT" +#run_lm_eval --port "$PORT" +MODEL_NAME="openai/$MODEL" +run_eval --framework lighteval --task gsm8k --num-fewshot 5 append_lm_eval_summary set +x \ No newline 
at end of file
diff --git a/benchmarks/dsr1_fp8_mi355x_slurm.sh b/benchmarks/dsr1_fp8_mi355x_slurm.sh
index b16c8e247..fe64e8cdd 100644
--- a/benchmarks/dsr1_fp8_mi355x_slurm.sh
+++ b/benchmarks/dsr1_fp8_mi355x_slurm.sh
@@ -53,6 +53,8 @@ run_benchmark_serving \
   --result-dir /workspace/

 # After throughput, run evaluation (defaults to GSM8K)
-run_lm_eval --port "$PORT"
+#run_lm_eval --port "$PORT"
+MODEL_NAME="openai/$MODEL"
+run_eval --framework lighteval --task gsm8k --num-fewshot 5
 append_lm_eval_summary
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh
index 9906b2fa5..e5a3a6961 100644
--- a/benchmarks/gptoss_fp4_h200_slurm.sh
+++ b/benchmarks/gptoss_fp4_h200_slurm.sh
@@ -75,7 +75,7 @@ run_benchmark_serving \
   --result-dir /workspace/

 # After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT"
+#run_eval --framework lm-eval --port "$PORT"
 run_eval --framework lighteval --task gsm8k --num-fewshot 5
 append_lm_eval_summary
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_h200_trt_slurm.sh b/benchmarks/gptoss_fp4_h200_trt_slurm.sh
index 3c959a7b1..620ddfb42 100644
--- a/benchmarks/gptoss_fp4_h200_trt_slurm.sh
+++ b/benchmarks/gptoss_fp4_h200_trt_slurm.sh
@@ -80,5 +80,5 @@ run_benchmark_serving \
   --result-dir /workspace/

 # After throughput, run evaluation (defaults to GSM8K)
-run_lm_eval --port "$PORT"
+#run_lm_eval --port "$PORT"
 append_lm_eval_summary
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_mi300x_slurm.sh b/benchmarks/gptoss_fp4_mi300x_slurm.sh
index b0ba7db04..8dbeefcf2 100644
--- a/benchmarks/gptoss_fp4_mi300x_slurm.sh
+++ b/benchmarks/gptoss_fp4_mi300x_slurm.sh
@@ -71,6 +71,6 @@ run_benchmark_serving \
   --result-dir /workspace/

 # After throughput, run evaluation (defaults to GSM8K)
-run_lm_eval --port "$PORT"
+#run_lm_eval --port "$PORT"
 append_lm_eval_summary
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_mi325x_docker.sh b/benchmarks/gptoss_fp4_mi325x_docker.sh
index ccfe6e1c3..db330a1b8 100644
--- a/benchmarks/gptoss_fp4_mi325x_docker.sh
+++ b/benchmarks/gptoss_fp4_mi325x_docker.sh
@@ -58,6 +58,6 @@ run_benchmark_serving \
   --result-dir /workspace/

 # After throughput, run evaluation (defaults to GSM8K)
-run_lm_eval --port "$PORT"
+#run_lm_eval --port "$PORT"
 append_lm_eval_summary
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_mi355x_slurm.sh b/benchmarks/gptoss_fp4_mi355x_slurm.sh
index 0dd860bb1..a999303b1 100644
--- a/benchmarks/gptoss_fp4_mi355x_slurm.sh
+++ b/benchmarks/gptoss_fp4_mi355x_slurm.sh
@@ -61,6 +61,6 @@ run_benchmark_serving \
   --result-dir /workspace/

 # After throughput, run evaluation (defaults to GSM8K)
-run_lm_eval --port "$PORT"
+#run_lm_eval --port "$PORT"
 append_lm_eval_summary
 set +x
\ No newline at end of file

From a2d77ffdf28b07ff3c224b7177c52d9da200d693 Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Wed, 26 Nov 2025 22:02:28 +0800
Subject: [PATCH 135/214] Preliminary lighteval for all 2 - fixed TP

---
 .github/workflows/eval-gms8k.yml | 2 +-
 .github/workflows/eval-tmpl.yml | 7 ++-----
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml
index d1b33a6c1..13556dc34 100644
--- a/.github/workflows/eval-gms8k.yml
+++ b/.github/workflows/eval-gms8k.yml
@@ -55,7 +55,7 @@
       framework: sglang
       precision: fp8
       exp-name: ${{ inputs.exp-name || 'dsr1_gsm8k_poc' }}
-      tp: '2'
+      tp: '8'
       ep: '1'
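#      (editorial aside, an assumption not stated in the commit message:
#      "fixed TP" raises tensor parallelism from the leftover gpt-oss value
#      of 2 to 8 so that DeepSeek-R1-0528 shards across all eight
#      accelerators on the mi325x runner)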
      dp-attn: false
       port: ${{ inputs.port || '8888' }}
diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml
index 4fb1f159c..b77d57c5b 100644
--- a/.github/workflows/eval-tmpl.yml
+++ b/.github/workflows/eval-tmpl.yml
@@ -158,12 +158,9 @@ jobs:
     - name: Resource cleanup
       run: |
-        ls -lt eval_out
-        ls -lt .litellm_cache
         pkill -f litellm || true
+        pkill -f lighteval || true
         sleep 2
-        if command -v fuser >/dev/null 2>&1; then
-          fuser -k /home/nvadmin/actions-runner/_work/InferenceMAX/InferenceMAX/.litellm_cache/cache.db 2>/dev/null || true
-        fi
+
         sudo rm -rf .litellm_cache
         sudo rm -rf eval_out*
\ No newline at end of file

From 4e139a03643998c398157a482f0009262756cca3 Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Wed, 26 Nov 2025 22:38:38 +0800
Subject: [PATCH 136/214] Preliminary lighteval for all 3

---
 .github/workflows/eval-gms8k.yml | 2 +-
 .github/workflows/eval-tmpl.yml | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml
index 13556dc34..239a91f2b 100644
--- a/.github/workflows/eval-gms8k.yml
+++ b/.github/workflows/eval-gms8k.yml
@@ -49,7 +49,7 @@ jobs:
     uses: ./.github/workflows/eval-tmpl.yml
     secrets: inherit
     with:
-      runner: mi325x-tw_1
+      runner: mi325x-tw_0
       image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' }}
       model: ${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }}
       framework: sglang
diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml
index b77d57c5b..6e0d3bbb7 100644
--- a/.github/workflows/eval-tmpl.yml
+++ b/.github/workflows/eval-tmpl.yml
@@ -64,11 +64,11 @@ env:
   NUM_FEWSHOT: ${{ inputs['num-fewshot'] }}
   LIMIT: ${{ inputs.limit }}
   EVAL_RESULT_DIR: eval_out
-  CONC: '4'
-  MAX_MODEL_LEN: '8192'
+  CONC: '32'
+  MAX_MODEL_LEN: '4096'
   ISL: 1024
   OSL: 1024
-  RANDOM_RANGE_RATIO: '1.0'
+  RANDOM_RANGE_RATIO: '0.8'
   RESULT_FILENAME: results

From 76b8c2cf4a7743350ca4ce816a56ae16d0695d0a Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Thu, 27 Nov 2025 13:37:03 +0800
Subject: [PATCH 137/214] Fix lighteval 1

---
 benchmarks/benchmark_lib.sh | 4 +++-
 benchmarks/dsr1_fp8_mi325x_slurm.sh | 2 +-
 utils/evals/custom_gsm8k.py | 20 ++++++++++++++++++++
 3 files changed, 24 insertions(+), 2 deletions(-)
 create mode 100644 utils/evals/custom_gsm8k.py

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index a7ab5c5c9..4afb109d8 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -548,6 +548,7 @@ run_lighteval_eval() {
     local num_fewshot="${NUM_FEWSHOT:-5}"
     local results_dir="${EVAL_RESULT_DIR:-eval_out_lighteval}"
     local max_samples=0
+    local concurrent_requests=8

     while [[ $# -gt 0 ]]; do
         case $1 in
@@ -556,6 +557,7 @@ run_lighteval_eval() {
             --num-fewshot) num_fewshot="$2"; shift 2 ;;
             --results-dir) results_dir="$2"; shift 2 ;;
             --max-samples) max_samples="$2"; shift 2 ;;
+            --concurrent-requests) concurrent_requests="$2"; shift 2 ;;
             *) echo "Unknown parameter: $1"; return 1 ;;
         esac
     done
@@ -580,7 +582,7 @@ run_lighteval_eval() {

     export OPENAI_API_KEY="${OPENAI_API_KEY:-EMPTY}"

-    local MODEL_ARGS="model_name=${lite_model},base_url=${base_url},api_key=${OPENAI_API_KEY},generation_parameters={temperature:0.0,max_new_tokens:2048},concurrent_requests=8"
+    local MODEL_ARGS="model_name=${lite_model},base_url=${base_url},api_key=${OPENAI_API_KEY},generation_parameters={temperature:0.0,max_new_tokens:2048},concurrent_requests=${concurrent_requests}"
     local TASK_SPEC="${task}|${num_fewshot}"
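#    (editorial sketch, not part of the patch: with the helper above and the
#    assumed values MODEL_NAME=openai/deepseek-ai/DeepSeek-R1-0528, PORT=8888,
#    and --concurrent-requests 32, the patched function ends up invoking
#    roughly:
#
#      lighteval endpoint litellm \
#        "model_name=openai/deepseek-ai/DeepSeek-R1-0528,base_url=http://0.0.0.0:8888/v1,api_key=EMPTY,generation_parameters={temperature:0.0,max_new_tokens:2048},concurrent_requests=32" \
#        "gsm8k|5" \
#        --output-dir /workspace/eval_out --max-samples 0
#    )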
set -x diff --git a/benchmarks/dsr1_fp8_mi325x_slurm.sh b/benchmarks/dsr1_fp8_mi325x_slurm.sh index 4cfd8fdad..b63495024 100644 --- a/benchmarks/dsr1_fp8_mi325x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi325x_slurm.sh @@ -46,6 +46,6 @@ run_benchmark_serving \ # After throughput, run evaluation (defaults to GSM8K) #run_lm_eval --port "$PORT" MODEL_NAME="openai/$MODEL" -run_eval --framework lighteval --task gsm8k --num-fewshot 5 +run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC append_lm_eval_summary set +x \ No newline at end of file diff --git a/utils/evals/custom_gsm8k.py b/utils/evals/custom_gsm8k.py new file mode 100644 index 000000000..4449188fa --- /dev/null +++ b/utils/evals/custom_gsm8k.py @@ -0,0 +1,20 @@ +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.tasks.gsm8k import gsm8k_prompt + +gsm8k_long = LightevalTaskConfig( + name="gsm8k_long", + prompt_function=gsm8k_prompt, + hf_repo="openai/gsm8k", + hf_subset="main", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling_from_train", + generation_size=768, # raise this as needed + metrics=[Metrics.expr_gold_metric], + stop_sequence=None, # avoid early stop on "Question:" + version=0, +) + +TASKS_TABLE = [gsm8k_long] From fda8e2c30902e03b9e40345a6fb3e21c91125d64 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 27 Nov 2025 14:04:48 +0800 Subject: [PATCH 138/214] Check both --- .github/workflows/eval-gms8k.yml | 2 +- benchmarks/benchmark_lib.sh | 7 +++++-- benchmarks/dsr1_fp8_mi325x_slurm.sh | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 239a91f2b..3e71d6885 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -61,4 +61,4 @@ jobs: port: ${{ inputs.port || '8888' }} eval-task: gsm8k num-fewshot: ${{ inputs.num_fewshot || '5' }} - limit: ${{ inputs.limit || '200' }} + limit: ${{ inputs.limit || '200' }} diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 4afb109d8..cf05c6984 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -246,6 +246,7 @@ run_lm_eval() { local gen_max_tokens=4096 local temperature=0 local top_p=1 + local concurrent_requests=32 while [[ $# -gt 0 ]]; do case $1 in @@ -257,6 +258,7 @@ run_lm_eval() { --gen-max-tokens) gen_max_tokens="$2"; shift 2 ;; --temperature) temperature="$2"; shift 2 ;; --top-p) top_p="$2"; shift 2 ;; + --concurrent-requests) concurrent_requests="$2"; shift 2 ;; *) echo "Unknown parameter: $1"; return 1 ;; esac done @@ -274,7 +276,7 @@ run_lm_eval() { --num_fewshot "${num_fewshot}" \ --batch_size "${batch_size}" \ --output_path "/workspace/${results_dir}" \ - --model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=3,num_concurrent=32,tokenized_requests=False" \ + --model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=3,num_concurrent=${concurrent_requests},tokenized_requests=False" \ --gen_kwargs "max_tokens=${gen_max_tokens},temperature=${temperature},top_p=${top_p}" set +x } @@ -548,7 +550,7 @@ run_lighteval_eval() { local num_fewshot="${NUM_FEWSHOT:-5}" local results_dir="${EVAL_RESULT_DIR:-eval_out_lighteval}" local max_samples=0 - local concurrent_requests=8 + local concurrent_requests=32 while [[ $# -gt 
0 ]]; do case $1 in @@ -590,6 +592,7 @@ run_lighteval_eval() { "${MODEL_ARGS}" \ "${TASK_SPEC}" \ --output-dir "/workspace/${results_dir}" \ + --custom-tasks utils/evals/custom_gsm8k.py \ --max-samples "${max_samples}" set +x } diff --git a/benchmarks/dsr1_fp8_mi325x_slurm.sh b/benchmarks/dsr1_fp8_mi325x_slurm.sh index b63495024..641e2cde9 100644 --- a/benchmarks/dsr1_fp8_mi325x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi325x_slurm.sh @@ -44,7 +44,7 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -#run_lm_eval --port "$PORT" +run_lm_eval --port "$PORT" --concurrent-requests $CONC MODEL_NAME="openai/$MODEL" run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC append_lm_eval_summary From 2e7c12783dfa0647e1655424c1e9e4f8681f5cff Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 27 Nov 2025 14:55:02 +0800 Subject: [PATCH 139/214] lm-eval check --- benchmarks/dsr1_fp8_mi325x_slurm.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmarks/dsr1_fp8_mi325x_slurm.sh b/benchmarks/dsr1_fp8_mi325x_slurm.sh index 641e2cde9..a19302f15 100644 --- a/benchmarks/dsr1_fp8_mi325x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi325x_slurm.sh @@ -44,8 +44,9 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) +MODEL_NAME="$MODEL" run_lm_eval --port "$PORT" --concurrent-requests $CONC -MODEL_NAME="openai/$MODEL" -run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC +#MODEL_NAME="openai/$MODEL" +#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC append_lm_eval_summary set +x \ No newline at end of file From 867bfc3cdb1043e89db13e53e5428caaa042ade5 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 27 Nov 2025 14:55:41 +0800 Subject: [PATCH 140/214] lm-eval check --- .github/workflows/eval-gms8k.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 3e71d6885..4a12776af 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -49,7 +49,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi325x-tw_0 + runner: mi325x-tw_0 image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' }} model: ${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }} framework: sglang From 8cbe81f4d8e33842fa4e1c88847837908564c0ea Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 27 Nov 2025 16:41:05 +0800 Subject: [PATCH 141/214] lm-eval check --- .github/workflows/eval-gms8k.yml | 14 +++++++------- benchmarks/benchmark_lib.sh | 2 -- benchmarks/gptoss_fp4_h100_slurm.sh | 4 ++-- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 4a12776af..eea18705c 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -49,13 +49,13 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi325x-tw_0 - image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' }} - model: ${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }} - framework: sglang - precision: fp8 - exp-name: ${{ inputs.exp-name || 'dsr1_gsm8k_poc' }} - tp: '8' + runner: h100-cw_1 + image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} + model: ${{ inputs.model || 
'openai/gpt-oss-120b' }}
+      framework: vllm
+      precision: fp4
+      exp-name: ${{ inputs.exp-name || 'gptoss_gsm8k_poc' }}
+      tp: '4'
       ep: '1'
       dp-attn: false
       port: ${{ inputs.port || '8888' }}
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index cf05c6984..d76b6fbd1 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -619,6 +619,4 @@ run_eval() {
         lighteval) run_lighteval_eval "${forwarded[@]}" ;;
         *) echo "Unknown framework '${framework}'"; return 1 ;;
     esac
-
-    ls -ld /workspace /workspace/eval_out* /workspace/results*
 }
diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh
index 19ad98294..ee2f37c4a 100644
--- a/benchmarks/gptoss_fp4_h100_slurm.sh
+++ b/benchmarks/gptoss_fp4_h100_slurm.sh
@@ -62,7 +62,7 @@ run_benchmark_serving \
   --result-dir /workspace/

 # After throughput, run evaluation (defaults to GSM8K)
-#run_eval --framework lm-eval --port "$PORT"
-run_eval --framework lighteval --task gsm8k --num-fewshot 5
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( (CONC * 3 + 1)/2 ))
+#run_eval --framework lighteval --task gsm8k --num-fewshot 5
 append_lm_eval_summary
 set +x

From 1b3b79f75fbd7500938821fdafda118655515bff Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Thu, 27 Nov 2025 17:03:52 +0800
Subject: [PATCH 142/214] lm-eval optimization

---
 benchmarks/benchmark_lib.sh | 23 +++++++++++++++++++++
 benchmarks/gptoss_fp4_h100_slurm.sh | 4 ++--
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index d76b6fbd1..f11fc0289 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -232,6 +232,29 @@ def _safe_mc_apply(self, resps, docs):
     return out

 ex.MultiChoiceRegexFilter.apply = _safe_mc_apply
+
+def _le_parse_generations(outputs, **kwargs):
+    res = []
+    if not isinstance(outputs, list):
+        outputs = [outputs]
+    for out in (outputs or []):
+        try:
+            choices = out.get("choices", [])
+            tmp = ["" for _ in choices]
+            for choice in choices:
+                idx = choice.get("index", 0)
+                msg = (choice.get("message") or {})
+                content = msg.get("content")
+                if content in (None, "", []):
+                    content = msg.get("reasoning_content") or ""
+                tmp[idx] = content
+        except Exception:
+            tmp = [""]
+        res.extend(tmp)
+    return res
+
+# Keep staticmethod semantics
+_LCC.parse_generations = staticmethod(_le_parse_generations)
 PY
 export PYTHONPATH="${patch_dir}:${PYTHONPATH:-}"
 }
diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh
index ee2f37c4a..3c84888b9 100644
--- a/benchmarks/gptoss_fp4_h100_slurm.sh
+++ b/benchmarks/gptoss_fp4_h100_slurm.sh
@@ -56,13 +56,13 @@ run_benchmark_serving \
   --input-len "$ISL" \
   --output-len "$OSL" \
   --random-range-ratio "$RANDOM_RANGE_RATIO" \
-  --num-prompts $(( $CONC * 10 )) \
+  --num-prompts $(( $CONC * 1 )) \
   --max-concurrency "$CONC" \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/

 # After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( (CONC * 3 + 1)/2 ))
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 )) #$(( (CONC * 3 + 1)/2 ))
 #run_eval --framework lighteval --task gsm8k --num-fewshot 5
 append_lm_eval_summary
 set +x

From 65f03037b658ff6603d985384173b2e8b2bd0b4b Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Fri, 28 Nov 2025 00:39:18 +0800
Subject: [PATCH 143/214] mi325x test

---
 .github/workflows/eval-gms8k.yml | 4 +--
 benchmarks/benchmark_lib.sh | 10
++---- benchmarks/gptoss_fp4_mi325x_slurm.sh | 5 +-- utils/evals/gsm8k.yaml | 45 +++++++++++++++++++++++++++ 4 files changed, 53 insertions(+), 11 deletions(-) create mode 100644 utils/evals/gsm8k.yaml diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index eea18705c..79173b639 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -49,8 +49,8 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: h100-cw_1 - image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} + runner: mi325x-tw_1 + image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm precision: fp4 diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index f11fc0289..29698e52e 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -265,7 +265,6 @@ run_lm_eval() { local task="${EVAL_TASK:-gsm8k}" local num_fewshot="${NUM_FEWSHOT:-5}" local results_dir="${EVAL_RESULT_DIR:-eval_out}" - local batch_size=3 local gen_max_tokens=4096 local temperature=0 local top_p=1 @@ -277,7 +276,6 @@ run_lm_eval() { --task) task="$2"; shift 2 ;; --num-fewshot) num_fewshot="$2"; shift 2 ;; --results-dir) results_dir="$2"; shift 2 ;; - --batch-size) batch_size="$2"; shift 2 ;; --gen-max-tokens) gen_max_tokens="$2"; shift 2 ;; --temperature) temperature="$2"; shift 2 ;; --top-p) top_p="$2"; shift 2 ;; @@ -295,11 +293,10 @@ run_lm_eval() { set -x python3 -m lm_eval --model local-chat-completions --apply_chat_template \ - --tasks "${task}" \ + --tasks "utils/evals/${task}.yaml" \ --num_fewshot "${num_fewshot}" \ - --batch_size "${batch_size}" \ --output_path "/workspace/${results_dir}" \ - --model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=3,num_concurrent=${concurrent_requests},tokenized_requests=False" \ + --model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=2,num_concurrent=${concurrent_requests},tokenized_requests=False" \ --gen_kwargs "max_tokens=${gen_max_tokens},temperature=${temperature},top_p=${top_p}" set +x } @@ -606,8 +603,7 @@ run_lighteval_eval() { local base_url="http://0.0.0.0:${port}/v1" export OPENAI_API_KEY="${OPENAI_API_KEY:-EMPTY}" - - local MODEL_ARGS="model_name=${lite_model},base_url=${base_url},api_key=${OPENAI_API_KEY},generation_parameters={temperature:0.0,max_new_tokens:2048},concurrent_requests=${concurrent_requests}" + local MODEL_ARGS="model_name=${lite_model},base_url=${base_url},api_key=${OPENAI_API_KEY},generation_parameters={temperature:0.0,top_p=1,max_new_tokens:2048},concurrent_requests=${concurrent_requests}" local TASK_SPEC="${task}|${num_fewshot}" set -x diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh index b3bdfbec7..bac78918d 100644 --- a/benchmarks/gptoss_fp4_mi325x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh @@ -70,7 +70,8 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -#run_eval --framework lm-eval --port "$PORT" -run_eval --framework lighteval --task gsm8k --num-fewshot 5 +MODEL_NAME="$MODEL" +run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 )) +#run_eval --framework lighteval --task gsm8k --num-fewshot 5 append_lm_eval_summary set +x \ No newline at end of file diff --git a/utils/evals/gsm8k.yaml 
b/utils/evals/gsm8k.yaml new file mode 100644 index 000000000..3d9e5ce3b --- /dev/null +++ b/utils/evals/gsm8k.yaml @@ -0,0 +1,45 @@ +tag: + - math_word_problems +task: gsm8k +dataset_path: gsm8k +dataset_name: main +output_type: generate_until +training_split: train +fewshot_split: train +test_split: test +doc_to_text: "Question: {{question}}\nEnd your answer with: #### \nAnswer:" +doc_to_target: "{{answer}}" #" {{answer.split('### ')[-1].rstrip()}}" +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: false + regexes_to_ignore: + - "," + - "\\$" + - "(?s).*#### " + - "\\.$" +generation_kwargs: + until: + - "Question:" + - "" + - "<|im_end|>" + do_sample: false + temperature: 0.0 +repeats: 1 +num_fewshot: 5 +filter_list: + - name: "strict-match" + filter: + - function: "regex" + regex_pattern: "#### (\\-?[0-9\\.\\,]+)" + - function: "take_first" + - name: "flexible-extract" + filter: + - function: "regex" + group_select: -1 + regex_pattern: "(-?[$0-9.,]{2,})|(-?[0-9]+)" + - function: "take_first" +metadata: + version: 3.0 From ddd3862ff73f0b4970b7f06ecfccb8a1ccae4a5b Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 28 Nov 2025 00:51:36 +0800 Subject: [PATCH 144/214] mi325x test --- .github/workflows/eval-gms8k.yml | 2 +- benchmarks/gptoss_fp4_mi325x_slurm.sh | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 79173b639..c4103fd0f 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -55,7 +55,7 @@ jobs: framework: vllm precision: fp4 exp-name: ${{ inputs.exp-name || 'gptoss_gsm8k_poc' }} - tp: '4' + tp: '2' ep: '1' dp-attn: false port: ${{ inputs.port || '8888' }} diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh index bac78918d..40ed241f7 100644 --- a/benchmarks/gptoss_fp4_mi325x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh @@ -70,7 +70,6 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -MODEL_NAME="$MODEL" run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 )) #run_eval --framework lighteval --task gsm8k --num-fewshot 5 append_lm_eval_summary From 30ad3ba00effb5f089f49a81153b811c3b1a9330 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 28 Nov 2025 13:35:50 +0800 Subject: [PATCH 145/214] all change, test deepseek --- .github/workflows/eval-gms8k.yml | 14 +-- benchmarks/dsr1_fp4_b200_docker.sh | 9 +- benchmarks/dsr1_fp4_b200_trt_slurm.sh | 11 +- benchmarks/dsr1_fp4_mi355x_docker.sh | 8 +- benchmarks/dsr1_fp4_mi355x_slurm.sh | 143 +----------------------- benchmarks/dsr1_fp8_b200_docker.sh | 6 +- benchmarks/dsr1_fp8_b200_trt_slurm.sh | 11 +- benchmarks/dsr1_fp8_h200_slurm.sh | 9 +- benchmarks/dsr1_fp8_h200_trt_slurm.sh | 16 +-- benchmarks/dsr1_fp8_mi300x_docker.sh | 6 +- benchmarks/dsr1_fp8_mi300x_slurm.sh | 6 +- benchmarks/dsr1_fp8_mi325x_docker.sh | 6 +- benchmarks/dsr1_fp8_mi325x_slurm.sh | 3 +- benchmarks/dsr1_fp8_mi355x_docker.sh | 6 +- benchmarks/dsr1_fp8_mi355x_slurm.sh | 6 +- benchmarks/gptoss_fp4_b200_docker.sh | 2 +- benchmarks/gptoss_fp4_b200_trt_slurm.sh | 12 +- benchmarks/gptoss_fp4_h100_docker.sh | 4 +- benchmarks/gptoss_fp4_h100_slurm.sh | 2 +- benchmarks/gptoss_fp4_h200_slurm.sh | 4 +- benchmarks/gptoss_fp4_h200_trt_slurm.sh | 20 +--- benchmarks/gptoss_fp4_mi300x_docker.sh | 4 +- benchmarks/gptoss_fp4_mi300x_slurm.sh | 2 +- 
benchmarks/gptoss_fp4_mi325x_docker.sh | 2 +- benchmarks/gptoss_fp4_mi325x_slurm.sh | 9 +- benchmarks/gptoss_fp4_mi355x_docker.sh | 4 +- benchmarks/gptoss_fp4_mi355x_slurm.sh | 2 +- 27 files changed, 86 insertions(+), 241 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index c4103fd0f..3e71d6885 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -49,13 +49,13 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi325x-tw_1 - image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} - model: ${{ inputs.model || 'openai/gpt-oss-120b' }} - framework: vllm - precision: fp4 - exp-name: ${{ inputs.exp-name || 'gptoss_gsm8k_poc' }} - tp: '2' + runner: mi325x-tw_0 + image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' }} + model: ${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }} + framework: sglang + precision: fp8 + exp-name: ${{ inputs.exp-name || 'dsr1_gsm8k_poc' }} + tp: '8' ep: '1' dp-attn: false port: ${{ inputs.port || '8888' }} diff --git a/benchmarks/dsr1_fp4_b200_docker.sh b/benchmarks/dsr1_fp4_b200_docker.sh index a2d0fb081..08469715e 100644 --- a/benchmarks/dsr1_fp4_b200_docker.sh +++ b/benchmarks/dsr1_fp4_b200_docker.sh @@ -49,7 +49,8 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -#run_lm_eval --port "$PORT" -MODEL_NAME="openai/$MODEL" -run_eval --framework lighteval --task gsm8k --num-fewshot 5 -append_lm_eval_summary \ No newline at end of file +run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 )) +#MODEL_NAME="openai/$MODEL" +#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC +append_lm_eval_summary +set +x \ No newline at end of file diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh index b4227e428..3700c3b40 100644 --- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh @@ -1,16 +1,13 @@ #!/usr/bin/env bash -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE +# === Required Env Vars === # MODEL +# TP +# CONC # ISL # OSL # MAX_MODEL_LEN # RANDOM_RANGE_RATIO -# TP -# CONC # RESULT_FILENAME # PORT_OFFSET # DP_ATTENTION @@ -118,4 +115,4 @@ run_benchmark_serving \ --num-prompts $(( $CONC * 10 )) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ + --result-dir /workspace/ \ No newline at end of file diff --git a/benchmarks/dsr1_fp4_mi355x_docker.sh b/benchmarks/dsr1_fp4_mi355x_docker.sh index b3f2a0e96..2c5bd1e42 100644 --- a/benchmarks/dsr1_fp4_mi355x_docker.sh +++ b/benchmarks/dsr1_fp4_mi355x_docker.sh @@ -51,7 +51,9 @@ run_benchmark_serving \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ -MODEL_NAME="openai/$MODEL" -run_eval --framework lighteval --task gsm8k --num-fewshot 5 +# After throughput, run evaluation (defaults to GSM8K) +run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 )) +#MODEL_NAME="openai/$MODEL" +#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC append_lm_eval_summary -set +x +set +x \ No newline at end of file diff --git a/benchmarks/dsr1_fp4_mi355x_slurm.sh b/benchmarks/dsr1_fp4_mi355x_slurm.sh index b0f1c33c0..a208e8d26 100644 --- a/benchmarks/dsr1_fp4_mi355x_slurm.sh +++ b/benchmarks/dsr1_fp4_mi355x_slurm.sh @@ 
-54,140 +54,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
-
-#######
-
-#
-## Evals setup
-# !TODO clean env vars
-EVAL_RESULT_DIR=${EVAL_RESULT_DIR:-eval_out}
-OPENAI_SERVER_BASE="http://0.0.0.0:${PORT}"
-OPENAI_COMP_BASE="$OPENAI_SERVER_BASE/v1/completions"
-OPENAI_CHAT_BASE="$OPENAI_SERVER_BASE/v1/chat/completions"
-export OPENAI_API_KEY=${OPENAI_API_KEY:-EMPTY}
-
-# Patch to convert bypass regex error if content field is empty
-PATCH_DIR="$(mktemp -d)"
-cat > "$PATCH_DIR/sitecustomize.py" <<'PY'
-import re, sys, unicodedata
-from lm_eval.filters import extraction as ex
-
-def _s(x):  # coerce to str
-    return x if isinstance(x, str) else ""
-
-# --- Patch RegexFilter.apply (used by many datasets) ---
-_orig_regex_apply = ex.RegexFilter.apply
-def _safe_regex_apply(self, resps, docs):
-    out = []
-    for inst in resps:  # inst is a list of candidate responses for one doc
-        filtered = []
-        for resp in inst:
-            txt = _s(resp)
-            m = self.regex.findall(txt)
-            if m:
-                m = m[self.group_select]
-                if isinstance(m, tuple):
-                    m = [t for t in m if t]
-                    m = m[0] if m else self.fallback
-                m = m.strip()
-            else:
-                m = self.fallback
-            filtered.append(m)
-        out.append(filtered)
-    return out
-ex.RegexFilter.apply = _safe_regex_apply
-
-# --- Patch MultiChoiceRegexFilter.apply (used by GSM8K flexible-extract) ---
-_orig_mc_apply = ex.MultiChoiceRegexFilter.apply
-def _safe_mc_apply(self, resps, docs):
-    def find_match(regex, resp, convert_dict={}):
-        txt = _s(resp)
-        match = regex.findall(txt)
-        if match:
-            match = match[self.group_select]
-            if isinstance(match, tuple):
-                match = [m for m in match if m]
-                if match:
-                    match = match[0]
-            if match:
-                match = match.strip()
-                if match in convert_dict:
-                    return convert_dict[match]
-            return match
-        return None
-
-    punct_tbl = dict.fromkeys(
-        i for i in range(sys.maxunicode)
-        if unicodedata.category(chr(i)).startswith("P")
-    )
-
-    def filter_ignores(st):
-        st = _s(st)
-        if self.regexes_to_ignore is not None:
-            for s in self.regexes_to_ignore:
-                st = re.sub(s, "", st)
-        if self.ignore_case:
-            st = st.lower()
-        if self.ignore_punctuation:
-            st = st.translate(punct_tbl)
-        return st
-
-    out = []
-    for r, doc in zip(resps, docs):
-        # Build fallback regexes from choices (A, B, C, ...) as in upstream
-        fallback_regexes, choice_to_alpha = [], {}
-        next_alpha = "A"
-        without_paren, without_paren_to_target = [], {}
-        for c in doc.get("choices", []):
-            m = filter_ignores(c.strip())
-            fallback_regexes.append(re.escape(m))
-            choice_to_alpha[m] = f"({next_alpha})"
-            without_paren.append(next_alpha)
-            without_paren_to_target[next_alpha] = f"({next_alpha})"
-            next_alpha = chr(ord(next_alpha) + 1)
-
-        fallback_regex = re.compile("|".join(fallback_regexes)) if fallback_regexes else None
-        without_paren_regex = re.compile(rf":[\s]*({'|'.join(without_paren)})") if without_paren else None
-
-        filtered = []
-        for resp in r:
-            m = find_match(self.regex, resp)
-            if not m and fallback_regex:
-                m = find_match(fallback_regex, filter_ignores(resp), choice_to_alpha)
-            if not m and without_paren_regex:
-                m = find_match(without_paren_regex, resp, without_paren_to_target)
-            if not m:
-                m = self.fallback
-            filtered.append(m)
-        out.append(filtered)
-    return out
-
-ex.MultiChoiceRegexFilter.apply = _safe_mc_apply
-PY
-
-export PYTHONPATH="${PATCH_DIR}:${PYTHONPATH:-}"
-set -x
-python3 -m lm_eval --model local-chat-completions --apply_chat_template \
---tasks ${EVAL_TASK:-gsm8k} \
---num_fewshot ${NUM_FEWSHOT:-5} \
---batch_size 2 \
---output_path "/workspace/${EVAL_RESULT_DIR}" \
---model_args "model=$MODEL,base_url=$OPENAI_CHAT_BASE,api_key=$OPENAI_API_KEY,eos_string=,max_retries=3,num_concurrent=32,tokenized_requests=False" \
---gen_kwargs "max_tokens=8192,temperature=0,top_p=1"
-set +x
-
-# Append a Markdown table to the GitHub Actions job summary using helper in bench_serving
-if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then
-python3 bench_serving/lm_eval_to_md.py \
-    --results-dir "/workspace/${EVAL_RESULT_DIR}" \
-    --task "${EVAL_TASK:-gsm8k}" \
-    --framework "${FRAMEWORK}" \
-    --precision "${PRECISION}" \
-    --tp "${TP:-1}" \
-    --ep "${EP_SIZE:-1}" \
-    --dp-attention "${DP_ATTENTION:-false}" \
-    >> "$GITHUB_STEP_SUMMARY" || true
-fi
-
-echo "Evaluation completed. Results in /workspace/${EVAL_RESULT_DIR}"
-exit 0
+# After throughput, run evaluation (defaults to GSM8K)
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+#MODEL_NAME="openai/$MODEL"
+#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC
+append_lm_eval_summary
+set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp8_b200_docker.sh b/benchmarks/dsr1_fp8_b200_docker.sh
index afb5c1655..c98e07d08 100644
--- a/benchmarks/dsr1_fp8_b200_docker.sh
+++ b/benchmarks/dsr1_fp8_b200_docker.sh
@@ -60,8 +60,8 @@ run_benchmark_serving \
   --result-dir /workspace/
 
 # After throughput, run evaluation (defaults to GSM8K)
-#run_lm_eval --port "$PORT"
-MODEL_NAME="openai/$MODEL"
-run_eval --framework lighteval --task gsm8k --num-fewshot 5
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+#MODEL_NAME="openai/$MODEL"
+#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC
 append_lm_eval_summary
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh
index a9a1a04ff..7a072ff66 100644
--- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh
+++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh
@@ -1,16 +1,13 @@
 #!/usr/bin/env bash
 
-# === Required Env Vars ===
-# HF_TOKEN
-# HF_HUB_CACHE
-# IMAGE
+# === Required Env Vars ===
 # MODEL
+# TP
+# CONC
 # ISL
 # OSL
 # MAX_MODEL_LEN
 # RANDOM_RANGE_RATIO
-# TP
-# CONC
 # RESULT_FILENAME
 # PORT_OFFSET
 # DP_ATTENTION
@@ -88,4 +85,4 @@ run_benchmark_serving \
   --num-prompts $(( $CONC * 10 )) \
   --max-concurrency "$CONC" \
   --result-filename "$RESULT_FILENAME" \
-  --result-dir /workspace/
+  --result-dir /workspace/
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp8_h200_slurm.sh b/benchmarks/dsr1_fp8_h200_slurm.sh
index 4c75cc17e..9ffd81f8d 100644
--- a/benchmarks/dsr1_fp8_h200_slurm.sh
+++ b/benchmarks/dsr1_fp8_h200_slurm.sh
@@ -65,7 +65,8 @@ run_benchmark_serving \
   --result-dir /workspace/
 
 # After throughput, run evaluation (defaults to GSM8K)
-#run_lm_eval --port "$PORT"
-MODEL_NAME="openai/$MODEL"
-run_eval --framework lighteval --task gsm8k --num-fewshot 5
-append_lm_eval_summary
\ No newline at end of file
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+#MODEL_NAME="openai/$MODEL"
+#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC
+append_lm_eval_summary
+set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh
index 797c2b67c..ac6bc167c 100644
--- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh
+++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh
@@ -1,16 +1,13 @@
 #!/usr/bin/env bash
 
-# === Required Env Vars ===
-# HF_TOKEN
-# HF_HUB_CACHE
-# IMAGE
+# === Required Env Vars ===
 # MODEL
+# TP
+# CONC
 # ISL
 # OSL
 # MAX_MODEL_LEN
 # RANDOM_RANGE_RATIO
-# TP
-# CONC
 # RESULT_FILENAME
 # PORT_OFFSET
 # DP_ATTENTION
@@ -89,10 +86,3 @@ run_benchmark_serving \
   --max-concurrency "$CONC" \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
-
-# After throughput, run evaluation (defaults to GSM8K)
-#run_lm_eval --port "$PORT"
-MODEL_NAME="openai/$MODEL"
-run_eval --framework lighteval --task gsm8k --num-fewshot 5
-append_lm_eval_summary
-set +x
diff --git a/benchmarks/dsr1_fp8_mi300x_docker.sh b/benchmarks/dsr1_fp8_mi300x_docker.sh
index cec263322..9e7d58295 100644
--- a/benchmarks/dsr1_fp8_mi300x_docker.sh
+++ b/benchmarks/dsr1_fp8_mi300x_docker.sh
@@ -58,8 +58,8 @@ run_benchmark_serving \
   --result-dir /workspace/
 
 # After throughput, run evaluation (defaults to GSM8K)
-#run_lm_eval --port "$PORT"
-MODEL_NAME="openai/$MODEL"
-run_eval --framework lighteval --task gsm8k --num-fewshot 5
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+#MODEL_NAME="openai/$MODEL"
+#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC
 append_lm_eval_summary
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp8_mi300x_slurm.sh b/benchmarks/dsr1_fp8_mi300x_slurm.sh
index f302c942f..ff7621425 100644
--- a/benchmarks/dsr1_fp8_mi300x_slurm.sh
+++ b/benchmarks/dsr1_fp8_mi300x_slurm.sh
@@ -67,7 +67,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-MODEL_NAME="openai/$MODEL"
-run_eval --framework lighteval --task gsm8k --num-fewshot 5
+# After throughput, run evaluation (defaults to GSM8K)
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+#MODEL_NAME="openai/$MODEL"
+#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC
 append_lm_eval_summary
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp8_mi325x_docker.sh b/benchmarks/dsr1_fp8_mi325x_docker.sh
index ffe79da5c..4231f2df7 100644
--- a/benchmarks/dsr1_fp8_mi325x_docker.sh
+++ b/benchmarks/dsr1_fp8_mi325x_docker.sh
@@ -48,7 +48,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-MODEL_NAME="openai/$MODEL"
-run_eval --framework lighteval --task gsm8k --num-fewshot 5
+# After throughput, run evaluation (defaults to GSM8K)
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+#MODEL_NAME="openai/$MODEL"
+#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC
 append_lm_eval_summary
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp8_mi325x_slurm.sh b/benchmarks/dsr1_fp8_mi325x_slurm.sh
index a19302f15..aef77d4ac 100644
--- a/benchmarks/dsr1_fp8_mi325x_slurm.sh
+++ b/benchmarks/dsr1_fp8_mi325x_slurm.sh
@@ -44,8 +44,7 @@ run_benchmark_serving \
   --result-dir /workspace/
 
 # After throughput, run evaluation (defaults to GSM8K)
-MODEL_NAME="$MODEL"
-run_lm_eval --port "$PORT" --concurrent-requests $CONC
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
 #MODEL_NAME="openai/$MODEL"
 #run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC
 append_lm_eval_summary
diff --git a/benchmarks/dsr1_fp8_mi355x_docker.sh b/benchmarks/dsr1_fp8_mi355x_docker.sh
index a5a6eee2a..f692a2173 100644
--- a/benchmarks/dsr1_fp8_mi355x_docker.sh
+++ b/benchmarks/dsr1_fp8_mi355x_docker.sh
@@ -59,8 +59,8 @@ run_benchmark_serving \
   --result-dir /workspace/
 
 # After throughput, run evaluation (defaults to GSM8K)
-#run_lm_eval --port "$PORT"
-MODEL_NAME="openai/$MODEL"
-run_eval --framework lighteval --task gsm8k --num-fewshot 5
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+#MODEL_NAME="openai/$MODEL"
+#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC
 append_lm_eval_summary
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp8_mi355x_slurm.sh b/benchmarks/dsr1_fp8_mi355x_slurm.sh
index fe64e8cdd..7916371bf 100644
--- a/benchmarks/dsr1_fp8_mi355x_slurm.sh
+++ b/benchmarks/dsr1_fp8_mi355x_slurm.sh
@@ -53,8 +53,8 @@ run_benchmark_serving \
   --result-dir /workspace/
 
 # After throughput, run evaluation (defaults to GSM8K)
-#run_lm_eval --port "$PORT"
-MODEL_NAME="openai/$MODEL"
-run_eval --framework lighteval --task gsm8k --num-fewshot 5
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+#MODEL_NAME="openai/$MODEL"
+#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC
 append_lm_eval_summary
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh
index f33e23bd6..9d095a8ef 100644
--- a/benchmarks/gptoss_fp4_b200_docker.sh
+++ b/benchmarks/gptoss_fp4_b200_docker.sh
@@ -80,7 +80,7 @@ run_benchmark_serving \
   --result-dir /workspace/
 
 # After throughput, run evaluation (defaults to GSM8K)
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
 #run_eval --framework lm-eval --port "$PORT"
-run_eval --framework lighteval --task gsm8k --num-fewshot 5
 append_lm_eval_summary
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh
index 0ec2f325f..ff9af7854 100644
--- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh
+++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh
@@ -1,16 +1,13 @@
 #!/usr/bin/env bash
 
-# === Required Env Vars ===
-# HF_TOKEN
-# HF_HUB_CACHE
-# IMAGE
+# === Required Env Vars ===
 # MODEL
+# TP
+# CONC
 # ISL
 # OSL
 # MAX_MODEL_LEN
 # RANDOM_RANGE_RATIO
-# TP
-# CONC
 # RESULT_FILENAME
 # PORT_OFFSET
 # DP_ATTENTION
@@ -86,7 +83,6 @@ source "$(dirname "$0")/benchmark_lib.sh"
 # Wait for server to be ready
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
-set -x
 run_benchmark_serving \
   --model "$MODEL" \
   --port "$PORT" \
@@ -97,4 +93,4 @@ run_benchmark_serving \
   --num-prompts $(( $CONC * 10 )) \
   --max-concurrency "$CONC" \
   --result-filename "$RESULT_FILENAME" \
-  --result-dir /workspace/
+  --result-dir /workspace/
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh
index 463f31b90..b5de6a296 100644
--- a/benchmarks/gptoss_fp4_h100_docker.sh
+++ b/benchmarks/gptoss_fp4_h100_docker.sh
@@ -61,7 +61,7 @@ run_benchmark_serving \
   --result-dir /workspace/
 
 # After throughput, run evaluation (defaults to GSM8K)
-#run_eval --framework lm-eval --port "$PORT"
-run_eval --framework lighteval --task gsm8k --num-fewshot 5
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+#run_eval --framework lighteval --task gsm8k --num-fewshot 5
 append_lm_eval_summary
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh
index 3c84888b9..6f9330f81 100644
--- a/benchmarks/gptoss_fp4_h100_slurm.sh
+++ b/benchmarks/gptoss_fp4_h100_slurm.sh
@@ -62,7 +62,7 @@ run_benchmark_serving \
   --result-dir /workspace/
 
 # After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 )) #$(( (CONC * 3 + 1)/2 ))
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
 #run_eval --framework lighteval --task gsm8k --num-fewshot 5
 append_lm_eval_summary
 set +x
diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh
index e5a3a6961..3db49afa2 100644
--- a/benchmarks/gptoss_fp4_h200_slurm.sh
+++ b/benchmarks/gptoss_fp4_h200_slurm.sh
@@ -75,7 +75,7 @@ run_benchmark_serving \
   --result-dir /workspace/
 
 # After throughput, run evaluation (defaults to GSM8K)
-#run_eval --framework lm-eval --port "$PORT"
-run_eval --framework lighteval --task gsm8k --num-fewshot 5
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+#run_eval --framework lighteval --task gsm8k --num-fewshot 5
 append_lm_eval_summary
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_h200_trt_slurm.sh b/benchmarks/gptoss_fp4_h200_trt_slurm.sh
index 620ddfb42..b6617ee9b 100644
--- a/benchmarks/gptoss_fp4_h200_trt_slurm.sh
+++ b/benchmarks/gptoss_fp4_h200_trt_slurm.sh
@@ -1,16 +1,12 @@
 #!/usr/bin/env bash
 
-# === Required Env Vars ===
-# HF_TOKEN
-# HF_HUB_CACHE
-# IMAGE
+# === Required Env Vars ===
 # MODEL
+# TP
+# CONC
 # ISL
 # OSL
-# MAX_MODEL_LEN
 # RANDOM_RANGE_RATIO
-# TP
-# CONC
 # RESULT_FILENAME
 # PORT_OFFSET
 # DP_ATTENTION
@@ -50,12 +46,12 @@ trtllm-serve $MODEL \
 --max_num_tokens 20000 \
 --backend pytorch \
 --extra_llm_api_options gptoss-config.yml \
---ep_size=1 \
+--ep_size=$EP_SIZE \
 --trust_remote_code \
 --gpus_per_node 8 \
 --host 0.0.0.0 \
 --port $PORT \
---tp_size=1 \
+--tp_size=$TP \
 --pp_size=1 \
 > $SERVER_LOG 2>&1 &
 
@@ -77,8 +73,4 @@ run_benchmark_serving \
   --num-prompts $(( $CONC * 10 )) \
   --max-concurrency "$CONC" \
   --result-filename "$RESULT_FILENAME" \
-  --result-dir /workspace/
-
-# After throughput, run evaluation (defaults to GSM8K)
-#run_lm_eval --port "$PORT"
-append_lm_eval_summary
\ No newline at end of file
+  --result-dir /workspace/
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh
index 0dade438d..95b4678de 100644
--- a/benchmarks/gptoss_fp4_mi300x_docker.sh
+++ b/benchmarks/gptoss_fp4_mi300x_docker.sh
@@ -62,7 +62,7 @@ run_benchmark_serving \
   --result-dir /workspace/
 
 # After throughput, run evaluation (defaults to GSM8K)
-#run_eval --framework lm-eval --port "$PORT"
-run_eval --framework lighteval --task gsm8k --num-fewshot 5
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+#run_eval --framework lighteval --task gsm8k --num-fewshot 5
 append_lm_eval_summary
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_mi300x_slurm.sh b/benchmarks/gptoss_fp4_mi300x_slurm.sh
index 8dbeefcf2..fe287eb50 100644
--- a/benchmarks/gptoss_fp4_mi300x_slurm.sh
+++ b/benchmarks/gptoss_fp4_mi300x_slurm.sh
@@ -71,6 +71,6 @@ run_benchmark_serving \
   --result-dir /workspace/
 
 # After throughput, run evaluation (defaults to GSM8K)
-#run_lm_eval --port "$PORT"
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
 append_lm_eval_summary
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_mi325x_docker.sh b/benchmarks/gptoss_fp4_mi325x_docker.sh
index db330a1b8..aa42c2888 100644
--- a/benchmarks/gptoss_fp4_mi325x_docker.sh
+++ b/benchmarks/gptoss_fp4_mi325x_docker.sh
@@ -58,6 +58,6 @@ run_benchmark_serving \
   --result-dir /workspace/
 
 # After throughput, run evaluation (defaults to GSM8K)
-#run_lm_eval --port "$PORT"
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
 append_lm_eval_summary
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh
index 40ed241f7..a49729bb5 100644
--- a/benchmarks/gptoss_fp4_mi325x_slurm.sh
+++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh
@@ -1,16 +1,13 @@
 #!/usr/bin/bash
 
-# === Required Env Vars ===
-# HF_TOKEN
-# HF_HUB_CACHE
-# IMAGE
+# === Required Env Vars ===
 # MODEL
+# TP
+# CONC
 # ISL
 # OSL
 # MAX_MODEL_LEN
 # RANDOM_RANGE_RATIO
-# TP
-# CONC
 # RESULT_FILENAME
 
 echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh
index 305210cf7..a04bbdba8 100644
--- a/benchmarks/gptoss_fp4_mi355x_docker.sh
+++ b/benchmarks/gptoss_fp4_mi355x_docker.sh
@@ -60,7 +60,7 @@ run_benchmark_serving \
   --result-dir /workspace/
 
 # After throughput, run evaluation (defaults to GSM8K)
-#run_eval --framework lm-eval --port "$PORT"
-run_eval --framework lighteval --task gsm8k --num-fewshot 5
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+#run_eval --framework lighteval --task gsm8k --num-fewshot 5
 append_lm_eval_summary
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_mi355x_slurm.sh b/benchmarks/gptoss_fp4_mi355x_slurm.sh
index a999303b1..b364c4758 100644
--- a/benchmarks/gptoss_fp4_mi355x_slurm.sh
+++ b/benchmarks/gptoss_fp4_mi355x_slurm.sh
@@ -61,6 +61,6 @@ run_benchmark_serving \
   --result-dir /workspace/
 
 # After throughput, run evaluation (defaults to GSM8K)
-#run_lm_eval --port "$PORT"
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
 append_lm_eval_summary
 set +x
\ No newline at end of file

From 688e2c52231d77795c9ef3e72580ec43f0d8440c Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Fri, 28 Nov 2025 13:46:21 +0800
Subject: [PATCH 146/214] all change, test deepseek

---
 .github/workflows/eval-gms8k.yml | 2 +-
 benchmarks/benchmark_lib.sh      | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml
index 3e71d6885..081f94120 100644
--- a/.github/workflows/eval-gms8k.yml
+++ b/.github/workflows/eval-gms8k.yml
@@ -55,7 +55,7 @@ jobs:
       framework: sglang
       precision: fp8
       exp-name: ${{ inputs.exp-name || 'dsr1_gsm8k_poc' }}
-      tp: '8'
+      tp: '8' 
       ep: '1'
       dp-attn: false
       port: ${{ inputs.port || '8888' }}
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 29698e52e..83e48a352 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -290,6 +290,7 @@ run_lm_eval() {
   local openai_server_base="http://0.0.0.0:${port}"
   local openai_chat_base="${openai_server_base}/v1/chat/completions"
   export OPENAI_API_KEY=${OPENAI_API_KEY:-EMPTY}
+  MODEL_NAME=${MODEL_NAME:-$MODEL}  # Prefer MODEL_NAME, else MODEL
 
   set -x
   python3 -m lm_eval --model local-chat-completions --apply_chat_template \

From 6b320ce8f1befe539c5e9e55d6c27eb5ca7f914 Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Fri, 28 Nov 2025 18:58:54 +0800
Subject: [PATCH 147/214] retest mi325x

---
 benchmarks/benchmark_lib.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 83e48a352..7d457530d 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -141,6 +141,7 @@ _patch_lm_eval_filters() {
   cat > "$patch_dir/sitecustomize.py" <<'PY'
 import re, sys, unicodedata
 from lm_eval.filters import extraction as ex
+from lm_eval.models.openai_completions import LocalChatCompletion as _LCC
 
 def _s(x):  # coerce to str
     return x if isinstance(x, str) else ""
@@ -265,7 +266,7 @@ run_lm_eval() {
   local task="${EVAL_TASK:-gsm8k}"
   local num_fewshot="${NUM_FEWSHOT:-5}"
   local results_dir="${EVAL_RESULT_DIR:-eval_out}"
-  local gen_max_tokens=4096
+  local gen_max_tokens=1024
   local temperature=0
   local top_p=1
   local concurrent_requests=32
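Patches 145 through 147 fold the old inline eval block into `benchmark_lib.sh`, which keeps patching lm-eval through a generated `sitecustomize.py` placed on `PYTHONPATH`. As a reading aid, here is a minimal, self-contained sketch of that loading mechanism; the file body and printed strings are placeholders, not the real shim:

```python
# Sketch of the sitecustomize trick used by _patch_lm_eval_filters: any module
# named `sitecustomize` that is importable at interpreter startup runs before
# the target CLI, so it can rebind library functions without forking the repo.
import os
import subprocess
import sys
import tempfile

patch_dir = tempfile.mkdtemp()
with open(os.path.join(patch_dir, "sitecustomize.py"), "w") as f:
    # Placeholder body; the real shim imports lm_eval and rebinds its filters.
    f.write('print("sitecustomize loaded: patches applied")\n')

env = dict(os.environ)
env["PYTHONPATH"] = patch_dir + os.pathsep + env.get("PYTHONPATH", "")

# Every child interpreter launched with this env runs the shim first.
subprocess.run([sys.executable, "-c", 'print("lm_eval would start here")'],
               env=env, check=True)
```

Expected output is the shim's line followed by the child's line; the `python3 -m lm_eval` call inside `run_lm_eval` picks the shim up the same way.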
From 9768deaf7fc1933270d81b013d58d4c3704f1642 Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Fri, 28 Nov 2025 19:02:45 +0800
Subject: [PATCH 148/214] test b200

---
 .github/workflows/eval-gms8k.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml
index 081f94120..0f0420fab 100644
--- a/.github/workflows/eval-gms8k.yml
+++ b/.github/workflows/eval-gms8k.yml
@@ -49,13 +49,13 @@ jobs:
     uses: ./.github/workflows/eval-tmpl.yml
     secrets: inherit
     with:
-      runner: mi325x-tw_0
-      image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' }}
-      model: ${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }}
+      runner: b200-nvd_2
+      image: ${{ inputs.image || 'lmsysorg/sglang:v0.5.5-cu129-amd64' }}
+      model: ${{ inputs.model || 'nvidia/DeepSeek-R1-0528-FP4-V2' }}
       framework: sglang
-      precision: fp8
+      precision: fp4
       exp-name: ${{ inputs.exp-name || 'dsr1_gsm8k_poc' }}
-      tp: '8' 
+      tp: '8'
       ep: '1'
       dp-attn: false
       port: ${{ inputs.port || '8888' }}

From 4c339b4637a0163cf6e2bf64867fcb4375cf6be9 Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Fri, 28 Nov 2025 19:38:02 +0800
Subject: [PATCH 149/214] clean b200

---
 .github/workflows/eval-tmpl.yml | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml
index 6e0d3bbb7..d2b43f569 100644
--- a/.github/workflows/eval-tmpl.yml
+++ b/.github/workflows/eval-tmpl.yml
@@ -79,6 +79,7 @@ jobs:
     steps:
       - name: Resource cleanup
         run: |
+          sudo rm -rf /home/nvadmin/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out/
          # Helper to avoid indefinite hangs on flaky tools (Docker/Slurm)
          safe_timeout() {
            if command -v timeout >/dev/null 2>&1; then
@@ -158,9 +159,4 @@ jobs:
 
       - name: Resource cleanup
         run: |
-          pkill -f litellm || true
-          pkill -f lighteval || true
-          sleep 2
-
-          sudo rm -rf .litellm_cache
           sudo rm -rf eval_out*
\ No newline at end of file

From efe94aa85a1f11c2c2234ed71b1d458b943604a8 Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Fri, 28 Nov 2025 19:56:52 +0800
Subject: [PATCH 150/214] test h200

---
 .github/workflows/eval-gms8k.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml
index 0f0420fab..92226ffad 100644
--- a/.github/workflows/eval-gms8k.yml
+++ b/.github/workflows/eval-gms8k.yml
@@ -49,11 +49,11 @@ jobs:
     uses: ./.github/workflows/eval-tmpl.yml
     secrets: inherit
     with:
-      runner: b200-nvd_2
+      runner: h200-cw_1
       image: ${{ inputs.image || 'lmsysorg/sglang:v0.5.5-cu129-amd64' }}
-      model: ${{ inputs.model || 'nvidia/DeepSeek-R1-0528-FP4-V2' }}
+      model: ${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }}
       framework: sglang
-      precision: fp4
+      precision: fp8
       exp-name: ${{ inputs.exp-name || 'dsr1_gsm8k_poc' }}
       tp: '8'
       ep: '1'

From 705fc106e982ac4ced02a8a6c984785e0b6cf257 Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Fri, 28 Nov 2025 20:42:30 +0800
Subject: [PATCH 151/214] H200 test

---
 .github/workflows/eval-gms8k.yml | 2 +-
 benchmarks/benchmark_lib.sh      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml
index 92226ffad..f7658eaa9 100644
--- a/.github/workflows/eval-gms8k.yml
+++ b/.github/workflows/eval-gms8k.yml
@@ -52,7 +52,7 @@ jobs:
       runner: h200-cw_1
       image: ${{ inputs.image || 'lmsysorg/sglang:v0.5.5-cu129-amd64' }}
       model: ${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }}
-      framework: sglang
+      framework: sglang 
       precision: fp8
       exp-name: ${{ inputs.exp-name || 'dsr1_gsm8k_poc' }}
       tp: '8'
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 7d457530d..529dbfa6e 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -266,7 +266,7 @@ run_lm_eval() {
   local task="${EVAL_TASK:-gsm8k}"
   local num_fewshot="${NUM_FEWSHOT:-5}"
   local results_dir="${EVAL_RESULT_DIR:-eval_out}"
-  local gen_max_tokens=1024
+  local gen_max_tokens=4096
   local temperature=0
   local top_p=1
   local concurrent_requests=32

From f79f243389d314dfd7698a5644cae58dbbc8a6a5 Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Fri, 28 Nov 2025 23:52:06 +0800
Subject: [PATCH 152/214] B200-nvd2 sleep

---
 .github/workflows/drain-b200-nvd2.yml | 36 +++++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100644 .github/workflows/drain-b200-nvd2.yml

diff --git a/.github/workflows/drain-b200-nvd2.yml b/.github/workflows/drain-b200-nvd2.yml
new file mode 100644
index 000000000..a7aeae576
--- /dev/null
+++ b/.github/workflows/drain-b200-nvd2.yml
@@ -0,0 +1,36 @@
+name: Drain b200-nvd_2
+
+on:
+  workflow_dispatch:
+    inputs:
+      minutes:
+        description: Minutes to hold (defaults to 72h if empty)
+        required: false
+        default: ""
+
+jobs:
+  hold:
+    # Pin specifically to the self-hosted runner label for b200-nvd_2
+    runs-on: [b200-nvd_2]
+    # Many orgs cap at 72h; adjust if your org allows more
+    timeout-minutes: ${{ fromJSON(github.event.inputs.minutes || '0') > 0 && fromJSON(github.event.inputs.minutes) || 4320 }}
+    steps:
+      - name: Start drain
+        shell: bash
+        run: |
+          set -euo pipefail
+          echo "Holding runner: $RUNNER_NAME"
+          echo "Runner OS/Arch: $RUNNER_OS / $RUNNER_ARCH"
+          echo "Started at: $(date -Iseconds)"
+          echo "Cancel this workflow run to release the runner."
+
+      - name: Hold indefinitely (until timeout or cancel)
+        shell: bash
+        run: |
+          set -euo pipefail
+          trap 'echo "Release signal received at $(date -Iseconds)"; exit 0' INT TERM
+          while true; do
+            echo "Still holding at $(date -Iseconds)"
+            sleep 1800
+          done
+

From d9a4fede00258ad508280362b69994ae704dc057 Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Fri, 28 Nov 2025 23:58:49 +0800
Subject: [PATCH 153/214] B200-nvd2 sleep

---
 .github/workflows/drain-b200-nvd2.yml | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/drain-b200-nvd2.yml b/.github/workflows/drain-b200-nvd2.yml
index a7aeae576..41bf0f027 100644
--- a/.github/workflows/drain-b200-nvd2.yml
+++ b/.github/workflows/drain-b200-nvd2.yml
@@ -11,9 +11,9 @@ on:
 jobs:
   hold:
     # Pin specifically to the self-hosted runner label for b200-nvd_2
-    runs-on: [b200-nvd_2]
-    # Many orgs cap at 72h; adjust if your org allows more
-    timeout-minutes: ${{ fromJSON(github.event.inputs.minutes || '0') > 0 && fromJSON(github.event.inputs.minutes) || 4320 }}
+    runs-on: [self-hosted, b200-nvd_2]
+    # Hold for 24h by default (override by canceling anytime)
+    timeout-minutes: 1440
     steps:
       - name: Start drain
         shell: bash
@@ -33,4 +33,3 @@ jobs:
             echo "Still holding at $(date -Iseconds)"
             sleep 1800
           done
-

From 8c6b944ed5c939b74428bf1439600060f67b7cdf Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Sat, 29 Nov 2025 00:00:04 +0800
Subject: [PATCH 154/214] B200-nvd2 sleep

---
 .github/workflows/drain-b200-nvd2.yml | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/drain-b200-nvd2.yml b/.github/workflows/drain-b200-nvd2.yml
index 41bf0f027..08646c3b4 100644
--- a/.github/workflows/drain-b200-nvd2.yml
+++ b/.github/workflows/drain-b200-nvd2.yml
@@ -1,12 +1,9 @@
 name: Drain b200-nvd_2
 
 on:
-  workflow_dispatch:
-    inputs:
-      minutes:
-        description: Minutes to hold (defaults to 72h if empty)
-        required: false
-        default: ""
+  push:
+    paths:
+      - '.github/workflows/drain-b200-nvd2.yml'
 
 jobs:
   hold:

From 28a026fd68b465f0e0b6e05e26310acf4cfc4f4d Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Sat, 29 Nov 2025 02:36:22 +0800
Subject: [PATCH 155/214] mi325x test

---
 .github/workflows/eval-gms8k.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml
index f7658eaa9..0734aca4c 100644
--- a/.github/workflows/eval-gms8k.yml
+++ b/.github/workflows/eval-gms8k.yml
@@ -49,10 +49,10 @@ jobs:
     uses: ./.github/workflows/eval-tmpl.yml
     secrets: inherit
     with:
-      runner: h200-cw_1
-      image: ${{ inputs.image || 'lmsysorg/sglang:v0.5.5-cu129-amd64' }}
+      runner: mi325x-tw_1
+      image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' }}
       model: ${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }}
-      framework: sglang 
+      framework: sglang
       precision: fp8
       exp-name: ${{ inputs.exp-name || 'dsr1_gsm8k_poc' }}
       tp: '8'

From c4bd3d2e6b086644777b90cd620f69a6e6f3a9e2 Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Sat, 29 Nov 2025 02:37:42 +0800
Subject: [PATCH 156/214] mi325x test, no text, no empty fix

---
 .github/workflows/eval-gms8k.yml |   2 +-
 benchmarks/benchmark_lib.sh      | 132 ++++++++----------------
 2 files changed, 41 insertions(+), 93 deletions(-)

diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml
index 0734aca4c..3e71d6885 100644
--- a/.github/workflows/eval-gms8k.yml
+++ b/.github/workflows/eval-gms8k.yml
@@ -49,7 +49,7 @@ jobs:
     uses: ./.github/workflows/eval-tmpl.yml
     secrets: inherit
     with:
-      runner: mi325x-tw_1
+      runner: mi325x-tw_0
       image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' }}
       model: ${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }}
       framework: sglang
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 529dbfa6e..5565756ca 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -133,107 +133,17 @@ _install_lm_eval_deps() {
     "git+https://github.com/EleutherAI/lm-evaluation-harness.git@main" || true
 }
 
+# Patch lm-eval filters to be robust to empty strings via sitecustomize
 # Patch lm-eval filters to be robust to empty strings via sitecustomize
 _patch_lm_eval_filters() {
   set +x
   local patch_dir
   patch_dir="$(mktemp -d)"
   cat > "$patch_dir/sitecustomize.py" <<'PY'
-import re, sys, unicodedata
+import re, sys, unicodedata, json
 from lm_eval.filters import extraction as ex
 from lm_eval.models.openai_completions import LocalChatCompletion as _LCC
 
-def _s(x):  # coerce to str
-    return x if isinstance(x, str) else ""
-
-# --- Patch RegexFilter.apply ---
-_orig_regex_apply = ex.RegexFilter.apply
-def _safe_regex_apply(self, resps, docs):
-    out = []
-    for inst in resps:
-        filtered = []
-        for resp in inst:
-            txt = _s(resp)
-            m = self.regex.findall(txt)
-            if m:
-                m = m[self.group_select]
-                if isinstance(m, tuple):
-                    m = [t for t in m if t]
-                    m = m[0] if m else self.fallback
-                m = m.strip()
-            else:
-                m = self.fallback
-            filtered.append(m)
-        out.append(filtered)
-    return out
-ex.RegexFilter.apply = _safe_regex_apply
-
-# --- Patch MultiChoiceRegexFilter.apply ---
-_orig_mc_apply = ex.MultiChoiceRegexFilter.apply
-def _safe_mc_apply(self, resps, docs):
-    def find_match(regex, resp, convert_dict={}):
-        txt = _s(resp)
-        match = regex.findall(txt)
-        if match:
-            match = match[self.group_select]
-            if isinstance(match, tuple):
-                match = [m for m in match if m]
-                if match:
-                    match = match[0]
-            if match:
-                match = match.strip()
-                if match in convert_dict:
-                    return convert_dict[match]
-            return match
-        return None
-
-    punct_tbl = dict.fromkeys(
-        i for i in range(sys.maxunicode)
-        if unicodedata.category(chr(i)).startswith("P")
-    )
-
-    def filter_ignores(st):
-        st = _s(st)
-        if self.regexes_to_ignore is not None:
-            for s in self.regexes_to_ignore:
-                st = re.sub(s, "", st)
-        if self.ignore_case:
-            st = st.lower()
-        if self.ignore_punctuation:
-            st = st.translate(punct_tbl)
-        return st
-
-    out = []
-    for r, doc in zip(resps, docs):
-        fallback_regexes, choice_to_alpha = [], {}
-        next_alpha = "A"
-        without_paren, without_paren_to_target = [], {}
-        for c in doc.get("choices", []):
-            m = filter_ignores(c.strip())
-            fallback_regexes.append(re.escape(m))
-            choice_to_alpha[m] = f"({next_alpha})"
-            without_paren.append(next_alpha)
-            without_paren_to_target[next_alpha] = f"({next_alpha})"
-            next_alpha = chr(ord(next_alpha) + 1)
-
-        fallback_regex = re.compile("|".join(fallback_regexes)) if fallback_regexes else None
-        without_paren_regex = re.compile(rf":[\s]*({'|'.join(without_paren)})") if without_paren else None
-
-        filtered = []
-        for resp in r:
-            m = find_match(self.regex, resp)
-            if not m and fallback_regex:
-                m = find_match(fallback_regex, filter_ignores(resp), choice_to_alpha)
-            if not m and without_paren_regex:
-                m = find_match(without_paren_regex, resp, without_paren_to_target)
-            if not m:
-                m = self.fallback
-            filtered.append(m)
-        out.append(filtered)
-    return out
-
-ex.MultiChoiceRegexFilter.apply = _safe_mc_apply
-
 def _le_parse_generations(outputs, **kwargs):
     res = []
     if not isinstance(outputs, list):
         outputs = [outputs]
@@ -256,6 +166,44 @@ def _le_parse_generations(outputs, **kwargs):
 
 # Keep staticmethod semantics
 _LCC.parse_generations = staticmethod(_le_parse_generations)
+
+# --- Patch TemplateAPI.apply_chat_template to avoid injecting "type": "text" ---
+try:
+    from lm_eval.models import api_models as _api_models
+    _TemplateAPI = _api_models.TemplateAPI
+    _JsonChatStr = _api_models.JsonChatStr
+except Exception:
+    _TemplateAPI = None
+    _JsonChatStr = None
+
+if _TemplateAPI is not None and _JsonChatStr is not None:
+    _orig_apply_chat_template = _TemplateAPI.apply_chat_template
+
+    def _patched_apply_chat_template(
+        self,
+        chat_history,
+        add_generation_prompt: bool = True,
+    ):
+        """Applies a chat template to a list of chat history between user and model."""
+        if self.tokenizer_backend == "huggingface" and self.tokenized_requests:
+            return self.tokenizer.apply_chat_template(
+                chat_history,
+                tokenize=False,
+                add_generation_prompt=add_generation_prompt,
+                continue_final_message=not add_generation_prompt,
+            )
+        elif self.tokenizer_backend == "remote" and self.tokenized_requests:
+            return chat_history
+        else:
+            # NOTE: we no longer inject `"type": "text"` when tokenizer is None / non-HF
+            return _JsonChatStr(
+                json.dumps(
+                    [{**item} for item in chat_history],
+                    ensure_ascii=False,
+                )
+            )
+
+    _TemplateAPI.apply_chat_template = _patched_apply_chat_template
 PY
 export PYTHONPATH="${patch_dir}:${PYTHONPATH:-}"
 }
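The `apply_chat_template` override above is the heart of patch 156's "no text" fix. A standalone sketch of the two payload shapes involved; the wrapped form is an assumption inferred from the patch's own NOTE comment rather than a copy of upstream lm-eval:

```python
import json

chat_history = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is 2 + 2?"},
]

# Wrapped shape (what the patch avoids): content becomes typed parts, which
# some OpenAI-compatible servers reject or mis-handle.
wrapped = json.dumps(
    [{"role": m["role"], "content": [{"type": "text", "text": m["content"]}]}
     for m in chat_history],
    ensure_ascii=False,
)

# Patched shape: messages forwarded as plain role/content pairs, exactly the
# `[{**item} for item in chat_history]` expression used in the shim.
plain = json.dumps([{**m} for m in chat_history], ensure_ascii=False)

print(wrapped)
print(plain)
```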
From af2c3855c2762ee9fa2f7f0f96c5333d8263499d Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Sat, 29 Nov 2025 17:34:28 +0800
Subject: [PATCH 157/214] h100, tmp eval_out

---
 .github/workflows/benchmark-tmpl.yml         |  6 ++-
 .github/workflows/eval-gms8k.yml             | 14 ++---
 .github/workflows/eval-tmpl.yml              |  6 +--
 .github/workflows/full-sweep-test.yml        |  6 +++
 benchmarks/benchmark_lib.sh                  | 15 +++---
 benchmarks/dsr1_fp4_b200_docker.sh           | 10 ++--
 benchmarks/dsr1_fp4_mi355x_docker.sh         | 10 ++--
 benchmarks/dsr1_fp4_mi355x_slurm.sh          | 10 ++--
 benchmarks/dsr1_fp8_b200_docker.sh           | 10 ++--
 benchmarks/dsr1_fp8_h200_slurm.sh            | 10 ++--
 benchmarks/dsr1_fp8_mi300x_docker.sh         | 10 ++--
 benchmarks/dsr1_fp8_mi300x_slurm.sh          | 10 ++--
 benchmarks/dsr1_fp8_mi325x_docker.sh         | 10 ++--
 benchmarks/dsr1_fp8_mi325x_slurm.sh          | 10 ++--
 benchmarks/dsr1_fp8_mi355x_docker.sh         | 10 ++--
 benchmarks/dsr1_fp8_mi355x_slurm.sh          | 10 ++--
 benchmarks/gptoss_fp4_b200_docker.sh         |  9 ++--
 benchmarks/gptoss_fp4_h100_docker.sh         |  9 ++--
 benchmarks/gptoss_fp4_h100_slurm.sh          | 11 ++--
 benchmarks/gptoss_fp4_h200_slurm.sh          |  9 ++--
 benchmarks/gptoss_fp4_mi300x_docker.sh       |  9 ++--
 benchmarks/gptoss_fp4_mi300x_slurm.sh        |  8 +--
 benchmarks/gptoss_fp4_mi325x_docker.sh       |  8 +--
 benchmarks/gptoss_fp4_mi325x_slurm.sh        |  9 ++--
 benchmarks/gptoss_fp4_mi355x_docker.sh       |  9 ++--
 benchmarks/gptoss_fp4_mi355x_slurm.sh        |  8 +--
 utils/matrix-logic/generate_sweep_configs.py | 54 ++++++++++++++++++++
 27 files changed, 188 insertions(+), 112 deletions(-)

diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
index 4496ac001..436b156a8 100644
--- a/.github/workflows/benchmark-tmpl.yml
+++ b/.github/workflows/benchmark-tmpl.yml
@@ -45,7 +45,10 @@ on:
         required: false
         type: string
         default: '0.8'
-
+      run-eval:
+        type: boolean
+        required: false
+        default: false
 env:
   HF_TOKEN: ${{ secrets.HF_TOKEN }}
   HF_HUB_CACHE: '/mnt/hf_hub_cache/'
@@ -62,6 +65,7 @@ env:
   EP_SIZE: ${{ inputs.ep }}
   DP_ATTENTION: ${{ inputs.dp-attn }}
   CONC: ${{ inputs.conc }}
+  RUN_EVAL: ${{ inputs.run-eval }}
 
 permissions:
   contents: read
diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml
index 3e71d6885..5a7e7e823 100644
--- a/.github/workflows/eval-gms8k.yml
+++ b/.github/workflows/eval-gms8k.yml
@@ -49,13 +49,13 @@ jobs:
     uses: ./.github/workflows/eval-tmpl.yml
     secrets: inherit
     with:
-      runner: mi325x-tw_0
-      image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' }}
-      model: ${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }}
-      framework: sglang
-      precision: fp8
-      exp-name: ${{ inputs.exp-name || 'dsr1_gsm8k_poc' }}
-      tp: '8'
+      runner: h100-cw_0
+      image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }}
+      model: ${{ inputs.model || 'openai/gpt-oss-120b' }}
+      framework: vllm
+      precision: fp4
+      exp-name: ${{ inputs.exp-name || 'gptoss_gsm8k_poc' }}
+      tp: '4'
       ep: '1'
       dp-attn: false
       port: ${{ inputs.port || '8888' }}
diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml
index d2b43f569..63c4164f1 100644
--- a/.github/workflows/eval-tmpl.yml
+++ b/.github/workflows/eval-tmpl.yml
@@ -155,8 +155,4 @@ jobs:
           path: |
             ${{ env.EVAL_RESULT_DIR }}/
             ${{ env.EVAL_RESULT_DIR }}/*
-            ${{ env.EVAL_RESULT_DIR }}/**
-
-      - name: Resource cleanup
-        run: |
-          sudo rm -rf eval_out*
\ No newline at end of file
+            ${{ env.EVAL_RESULT_DIR }}/**
\ No newline at end of file
diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml
index cf889e208..01bae1f5d 100644
--- a/.github/workflows/full-sweep-test.yml
+++ b/.github/workflows/full-sweep-test.yml
@@ -130,6 +130,7 @@ jobs:
       ep: ${{ matrix.config.ep }}
       dp-attn: ${{ matrix.config.dp-attn }}
       conc: ${{ matrix.config.conc }}
+      run-eval: ${{ matrix.config.run-eval }}
 
   collect-dsr1-1k1k-results:
     needs: benchmark-dsr1-1k1k
@@ -164,6 +165,7 @@ jobs:
       ep: ${{ matrix.config.ep }}
       dp-attn: ${{ matrix.config.dp-attn }}
      conc: ${{ matrix.config.conc }}
+      run-eval: ${{ matrix.config.run-eval }}
 
   collect-gptoss-1k1k-results:
     needs: benchmark-gptoss-1k1k
@@ -198,6 +200,7 @@ jobs:
       ep: ${{ matrix.config.ep }}
       dp-attn: ${{ matrix.config.dp-attn }}
       conc: ${{ matrix.config.conc }}
+      run-eval: ${{ matrix.config.run-eval }}
 
   collect-dsr1-8k1k-results:
     needs: benchmark-dsr1-8k1k
@@ -232,6 +235,7 @@ jobs:
       ep: ${{ matrix.config.ep }}
       dp-attn: ${{ matrix.config.dp-attn }}
       conc: ${{ matrix.config.conc }}
+      run-eval: ${{ matrix.config.run-eval }}
 
   collect-gptoss-8k1k-results:
     needs: benchmark-gptoss-8k1k
@@ -266,6 +270,7 @@ jobs:
       ep: ${{ matrix.config.ep }}
       dp-attn: ${{ matrix.config.dp-attn }}
       conc: ${{ matrix.config.conc }}
+      run-eval: ${{ matrix.config.run-eval }}
 
 # This is a workaround until we can integrate GB200 into master configs.
   benchmark-gb200-1k1k:
@@ -394,6 +399,7 @@ jobs:
       ep: ${{ matrix.config.ep }}
       dp-attn: ${{ matrix.config.dp-attn }}
       conc: ${{ matrix.config.conc }}
+      run-eval: ${{ matrix.config.run-eval }}
 
   collect-gptoss-1k8k-results:
     needs: benchmark-gptoss-1k8k
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 5565756ca..5f2b51ec7 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -213,7 +213,7 @@ run_lm_eval() {
   local port="${PORT:-8888}"
   local task="${EVAL_TASK:-gsm8k}"
   local num_fewshot="${NUM_FEWSHOT:-5}"
-  local results_dir="${EVAL_RESULT_DIR:-eval_out}"
+  local results_dir="${EVAL_RESULT_DIR:-$(mktemp -d /tmp/eval_out-XXXXXX)}"
   local gen_max_tokens=4096
   local temperature=0
   local top_p=1
@@ -241,11 +241,14 @@ run_lm_eval() {
   export OPENAI_API_KEY=${OPENAI_API_KEY:-EMPTY}
   MODEL_NAME=${MODEL_NAME:-$MODEL}  # Prefer MODEL_NAME, else MODEL
 
+  # Export for append_lm_eval_summary to pick up
+  export EVAL_RESULT_DIR="$results_dir"
+
   set -x
   python3 -m lm_eval --model local-chat-completions --apply_chat_template \
     --tasks "utils/evals/${task}.yaml" \
     --num_fewshot "${num_fewshot}" \
-    --output_path "/workspace/${results_dir}" \
+    --output_path "${results_dir}" \
     --model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=2,num_concurrent=${concurrent_requests},tokenized_requests=False" \
     --gen_kwargs "max_tokens=${gen_max_tokens},temperature=${temperature},top_p=${top_p}"
   set +x
@@ -253,10 +256,9 @@ run_lm_eval() {
 
 append_lm_eval_summary() {
   set +x
-  local results_dir="${EVAL_RESULT_DIR:-eval_out}"
+  local results_dir="${EVAL_RESULT_DIR}"
   local task="${EVAL_TASK:-gsm8k}"
-  # Always render a local summary so the runner can pick it up
-  local out_dir="/workspace/${results_dir}"
+  local out_dir="${results_dir}"
   local summary_md="${out_dir}/SUMMARY.md"
   mkdir -p "$out_dir" || true
@@ -278,8 +280,9 @@ append_lm_eval_summary() {
       cat "$summary_md" >> "$GITHUB_STEP_SUMMARY" || true
     fi
   fi
-}
+
+  echo "Results saved to: ${summary_md}"
+}
 
 # ------------------------------
 # Lighteval + LiteLLM patching
diff --git a/benchmarks/dsr1_fp4_b200_docker.sh b/benchmarks/dsr1_fp4_b200_docker.sh
index 08469715e..31319c44e 100644
--- a/benchmarks/dsr1_fp4_b200_docker.sh
+++ b/benchmarks/dsr1_fp4_b200_docker.sh
@@ -48,9 +48,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-#MODEL_NAME="openai/$MODEL"
-#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC
-append_lm_eval_summary
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp4_mi355x_docker.sh b/benchmarks/dsr1_fp4_mi355x_docker.sh
index 2c5bd1e42..bad7c2854 100644
--- a/benchmarks/dsr1_fp4_mi355x_docker.sh
+++ b/benchmarks/dsr1_fp4_mi355x_docker.sh
@@ -51,9 +51,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-#MODEL_NAME="openai/$MODEL"
-#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC
-append_lm_eval_summary
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp4_mi355x_slurm.sh b/benchmarks/dsr1_fp4_mi355x_slurm.sh
index a208e8d26..31d6a94a9 100644
--- a/benchmarks/dsr1_fp4_mi355x_slurm.sh
+++ b/benchmarks/dsr1_fp4_mi355x_slurm.sh
@@ -54,9 +54,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-#MODEL_NAME="openai/$MODEL"
-#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC
-append_lm_eval_summary
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp8_b200_docker.sh b/benchmarks/dsr1_fp8_b200_docker.sh
index c98e07d08..73696ff4a 100644
--- a/benchmarks/dsr1_fp8_b200_docker.sh
+++ b/benchmarks/dsr1_fp8_b200_docker.sh
@@ -59,9 +59,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-#MODEL_NAME="openai/$MODEL"
-#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC
-append_lm_eval_summary
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp8_h200_slurm.sh b/benchmarks/dsr1_fp8_h200_slurm.sh
index 9ffd81f8d..088e71c0a 100644
--- a/benchmarks/dsr1_fp8_h200_slurm.sh
+++ b/benchmarks/dsr1_fp8_h200_slurm.sh
@@ -64,9 +64,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-#MODEL_NAME="openai/$MODEL"
-#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC
-append_lm_eval_summary
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp8_mi300x_docker.sh b/benchmarks/dsr1_fp8_mi300x_docker.sh
index 9e7d58295..8027ae1eb 100644
--- a/benchmarks/dsr1_fp8_mi300x_docker.sh
+++ b/benchmarks/dsr1_fp8_mi300x_docker.sh
@@ -57,9 +57,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-#MODEL_NAME="openai/$MODEL"
-#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC
-append_lm_eval_summary
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp8_mi300x_slurm.sh b/benchmarks/dsr1_fp8_mi300x_slurm.sh
index ff7621425..7e222726c 100644
--- a/benchmarks/dsr1_fp8_mi300x_slurm.sh
+++ b/benchmarks/dsr1_fp8_mi300x_slurm.sh
@@ -67,9 +67,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-#MODEL_NAME="openai/$MODEL"
-#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC
-append_lm_eval_summary
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp8_mi325x_docker.sh b/benchmarks/dsr1_fp8_mi325x_docker.sh
index 4231f2df7..940d4b076 100644
--- a/benchmarks/dsr1_fp8_mi325x_docker.sh
+++ b/benchmarks/dsr1_fp8_mi325x_docker.sh
@@ -48,9 +48,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-#MODEL_NAME="openai/$MODEL"
-#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC
-append_lm_eval_summary
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp8_mi325x_slurm.sh b/benchmarks/dsr1_fp8_mi325x_slurm.sh
index aef77d4ac..8adf3d745 100644
--- a/benchmarks/dsr1_fp8_mi325x_slurm.sh
+++ b/benchmarks/dsr1_fp8_mi325x_slurm.sh
@@ -43,9 +43,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-#MODEL_NAME="openai/$MODEL"
-#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC
-append_lm_eval_summary
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp8_mi355x_docker.sh b/benchmarks/dsr1_fp8_mi355x_docker.sh
index f692a2173..5aa0afd3e 100644
--- a/benchmarks/dsr1_fp8_mi355x_docker.sh
+++ b/benchmarks/dsr1_fp8_mi355x_docker.sh
@@ -58,9 +58,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-#MODEL_NAME="openai/$MODEL"
-#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC
-append_lm_eval_summary
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp8_mi355x_slurm.sh b/benchmarks/dsr1_fp8_mi355x_slurm.sh
index 7916371bf..ec734d1c6 100644
--- a/benchmarks/dsr1_fp8_mi355x_slurm.sh
+++ b/benchmarks/dsr1_fp8_mi355x_slurm.sh
@@ -52,9 +52,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-#MODEL_NAME="openai/$MODEL"
-#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC
-append_lm_eval_summary
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh
index 9d095a8ef..64613e11b 100644
--- a/benchmarks/gptoss_fp4_b200_docker.sh
+++ b/benchmarks/gptoss_fp4_b200_docker.sh
@@ -79,8 +79,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-#run_eval --framework lm-eval --port "$PORT"
-append_lm_eval_summary
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh
index b5de6a296..dd552d8c3 100644
--- a/benchmarks/gptoss_fp4_h100_docker.sh
+++ b/benchmarks/gptoss_fp4_h100_docker.sh
@@ -60,8 +60,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-#run_eval --framework lighteval --task gsm8k --num-fewshot 5
-append_lm_eval_summary
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh
index 6f9330f81..5922220ac 100644
--- a/benchmarks/gptoss_fp4_h100_slurm.sh
+++ b/benchmarks/gptoss_fp4_h100_slurm.sh
@@ -61,8 +61,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-#run_eval --framework lighteval --task gsm8k --num-fewshot 5
-append_lm_eval_summary
-set +x
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
+set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh
index 3db49afa2..7ca2ab001 100644
--- a/benchmarks/gptoss_fp4_h200_slurm.sh
+++ b/benchmarks/gptoss_fp4_h200_slurm.sh
@@ -74,8 +74,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-#run_eval --framework lighteval --task gsm8k --num-fewshot 5
-append_lm_eval_summary
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh
index 95b4678de..90b8eb42b 100644
--- a/benchmarks/gptoss_fp4_mi300x_docker.sh
+++ b/benchmarks/gptoss_fp4_mi300x_docker.sh
@@ -61,8 +61,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-#run_eval --framework lighteval --task gsm8k --num-fewshot 5
-append_lm_eval_summary
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_mi300x_slurm.sh b/benchmarks/gptoss_fp4_mi300x_slurm.sh
index fe287eb50..d97355818 100644
--- a/benchmarks/gptoss_fp4_mi300x_slurm.sh
+++ b/benchmarks/gptoss_fp4_mi300x_slurm.sh
@@ -70,7 +70,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-append_lm_eval_summary
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_mi325x_docker.sh b/benchmarks/gptoss_fp4_mi325x_docker.sh
index aa42c2888..a5cd09637 100644
--- a/benchmarks/gptoss_fp4_mi325x_docker.sh
+++ b/benchmarks/gptoss_fp4_mi325x_docker.sh
@@ -57,7 +57,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-append_lm_eval_summary
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh
index a49729bb5..80b58b45b 100644
--- a/benchmarks/gptoss_fp4_mi325x_slurm.sh
+++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh
@@ -66,8 +66,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-#run_eval --framework lighteval --task gsm8k --num-fewshot 5
-append_lm_eval_summary
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh
index a04bbdba8..e75b06d15 100644
--- a/benchmarks/gptoss_fp4_mi355x_docker.sh
+++ b/benchmarks/gptoss_fp4_mi355x_docker.sh
@@ -59,8 +59,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-#run_eval --framework lighteval --task gsm8k --num-fewshot 5
-append_lm_eval_summary
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_mi355x_slurm.sh b/benchmarks/gptoss_fp4_mi355x_slurm.sh
index b364c4758..845398ca0 100644
--- a/benchmarks/gptoss_fp4_mi355x_slurm.sh
+++ b/benchmarks/gptoss_fp4_mi355x_slurm.sh
@@ -60,7 +60,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-append_lm_eval_summary
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
 set +x
\ No newline at end of file
diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py
index f0d9a4390..1110e3c30 100644
--- a/utils/matrix-logic/generate_sweep_configs.py
+++ b/utils/matrix-logic/generate_sweep_configs.py
@@ -31,6 +31,9 @@
 FIELD_MAX_MODEL_LEN = 'max-model-len'
 FIELD_EXP_NAME = 'exp-name'
 
+# Eval
+FIELD_RUN_EVAL = 'run-eval'
+
 seq_len_stoi = {
     "1k1k": (1024, 1024),
     "1k8k": (1024, 8192),
@@ -67,6 +70,7 @@ class MatrixEntry(BaseModel):
     conc: int
     max_model_len: int = Field(alias='max-model-len')
     exp_name: str = Field(alias='exp-name')
+    run_eval: bool = Field(alias='run-eval', default=False)
 
 
 def validate_matrix_output(matrix_values: List[dict]) -> List[dict]:
@@ -82,6 +86,53 @@ def validate_matrix_output(matrix_values: List[dict]) -> List[dict]:
             raise ValueError(f"Matrix entry at index {i} failed validation:\n{e}")
     return matrix_values
 
+
+def mark_eval_entries(matrix_values: List[dict]) -> List[dict]:
+    """Mark entries that should run evaluation.
+
+    For each unique (model, runner, isl, osl) combination:
+    - Mark highest TP with highest conc
+    - Mark lowest TP with highest conc
+    """
+    from collections import defaultdict
+
+    # Group entries by (model, runner, isl, osl)
+    groups = defaultdict(list)
+    for i, entry in enumerate(matrix_values):
+        key = (entry[FIELD_MODEL], entry[FIELD_RUNNER], entry[FIELD_ISL], entry[FIELD_OSL])
+        groups[key].append((i, entry))
+
+    # For each group, find highest TP/highest conc and lowest TP/highest conc
+    eval_indices = set()
+    for key, entries in groups.items():
+        if not entries:
+            continue
+
+        # Find min and max TP values
+        min_tp = min(e[FIELD_TP] for _, e in entries)
+        max_tp = max(e[FIELD_TP] for _, e in entries)
+
+        # Find highest conc for highest TP
+        highest_tp_entries = [(i, e) for i, e in entries if e[FIELD_TP] == max_tp]
+        if highest_tp_entries:
+            max_conc_highest_tp = max(e[FIELD_CONC] for _, e in highest_tp_entries)
+            for i, e in highest_tp_entries:
+                if e[FIELD_CONC] == max_conc_highest_tp:
+                    eval_indices.add(i)
+
+        # Find highest conc for lowest TP (only if different from max_tp)
+        if min_tp != max_tp:
+            lowest_tp_entries = [(i, e) for i, e in entries if e[FIELD_TP] == min_tp]
+            if lowest_tp_entries:
+                max_conc_lowest_tp = max(e[FIELD_CONC] for _, e in lowest_tp_entries)
+                for i, e in lowest_tp_entries:
+                    if e[FIELD_CONC] == max_conc_lowest_tp:
+                        eval_indices.add(i)
+
+    # Mark the selected entries
+    for i, entry in enumerate(matrix_values):
+        entry[FIELD_RUN_EVAL] = i in eval_indices
+
+    return matrix_values
+
 
 def validate_master_configs_structure(all_config_data):
     """Validate the structure of all master config entries.
@@ -957,6 +1008,9 @@ def main():
     else:
         parser.error(f"Unknown command: {args.command}")
 
+    # Choose eval
+    matrix_values = mark_eval_entries(matrix_values)
+
     # Validate output before printing
     validate_matrix_output(matrix_values)
 

From 14068bc89655cd893b46a49459001273f3f9c813 Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Sat, 29 Nov 2025 17:46:33 +0800
Subject: [PATCH 158/214] h100, tmp eval_out, sweep integration

---
 .github/workflows/eval-tmpl.yml | 1 +
 utils/evals/gsm8k.yaml          | 1 +
 2 files changed, 2 insertions(+)

diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml
index 63c4164f1..04a2eebed 100644
--- a/.github/workflows/eval-tmpl.yml
+++ b/.github/workflows/eval-tmpl.yml
@@ -70,6 +70,7 @@ env:
   OSL: 1024
   RANDOM_RANGE_RATIO: '0.8'
   RESULT_FILENAME: results
+  RUN_EVAL: true
 
 jobs:
   eval:
diff --git a/utils/evals/gsm8k.yaml b/utils/evals/gsm8k.yaml
index 3d9e5ce3b..ab3113dc2 100644
--- a/utils/evals/gsm8k.yaml
+++ b/utils/evals/gsm8k.yaml
@@ -33,6 +33,7 @@ filter_list:
   - name: "strict-match"
     filter:
       - function: "regex"
+        group_select: -1
         regex_pattern: "#### (\\-?[0-9\\.\\,]+)"
       - function: "take_first"
   - name: "flexible-extract"
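Patch 157's `mark_eval_entries` decides which sweep entries carry `run-eval: true`. To see what the rule picks, here is a toy re-run of the same logic as a standalone script; it uses plain dict keys instead of the repo's `FIELD_*` constants, so the names are illustrative only:

```python
# Standalone re-run of the mark_eval_entries selection rule on a toy matrix:
# per (model, runner, isl, osl) group, mark the highest-conc entry at the
# lowest TP and at the highest TP.
from collections import defaultdict

entries = [
    {"model": "gptoss", "runner": "h100", "isl": 1024, "osl": 1024, "tp": 1, "conc": 16},
    {"model": "gptoss", "runner": "h100", "isl": 1024, "osl": 1024, "tp": 1, "conc": 64},
    {"model": "gptoss", "runner": "h100", "isl": 1024, "osl": 1024, "tp": 8, "conc": 16},
    {"model": "gptoss", "runner": "h100", "isl": 1024, "osl": 1024, "tp": 8, "conc": 64},
]

groups = defaultdict(list)
for i, e in enumerate(entries):
    groups[(e["model"], e["runner"], e["isl"], e["osl"])].append(i)

marked = set()
for idxs in groups.values():
    # The set collapses min and max TP into one pick when they are equal.
    for pick_tp in {min(entries[i]["tp"] for i in idxs),
                    max(entries[i]["tp"] for i in idxs)}:
        tp_idxs = [i for i in idxs if entries[i]["tp"] == pick_tp]
        best_conc = max(entries[i]["conc"] for i in tp_idxs)
        marked.update(i for i in tp_idxs if entries[i]["conc"] == best_conc)

for i, e in enumerate(entries):
    print(e["tp"], e["conc"], i in marked)
# Only (tp=1, conc=64) and (tp=8, conc=64) come out marked, so each sweep runs
# at most two GSM8K evals per model/runner/sequence-length group.
```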
name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.framework }} ${{ inputs.precision }} tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} conc=${{ inputs.conc }}' + name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.framework }} ${{ inputs.precision }} tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} conc=${{ inputs.conc }} eval=${{ inputs.run-eval }}' steps: - name: Resource cleanup run: | diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 04a2eebed..e4e65a581 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -63,7 +63,8 @@ env: EVAL_TASK: ${{ inputs['eval-task'] }} NUM_FEWSHOT: ${{ inputs['num-fewshot'] }} LIMIT: ${{ inputs.limit }} - EVAL_RESULT_DIR: eval_out + # Keep eval outputs only under /tmp + EVAL_RESULT_DIR: /tmp/eval_out CONC: '32' MAX_MODEL_LEN: '4096' ISL: 1024 @@ -148,12 +149,4 @@ jobs: run: | bash ./runners/launch_${RUNNER_NAME%%_*}.sh - - name: Upload eval artifacts - if: always() - uses: actions/upload-artifact@v5 - with: - name: eval_${{ env.EXP_NAME }}_${{ runner.name }} - path: | - ${{ env.EVAL_RESULT_DIR }}/ - ${{ env.EVAL_RESULT_DIR }}/* - ${{ env.EVAL_RESULT_DIR }}/** \ No newline at end of file + # Intentionally no eval artifact uploads: eval outputs remain in /tmp only. diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index 01bae1f5d..ed6132450 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -175,6 +175,7 @@ jobs: with: exp-name: "gptoss_1k1k" + # DSR1 8K1K Benchmarks benchmark-dsr1-8k1k: needs: get-configs @@ -245,6 +246,7 @@ jobs: with: exp-name: "gptoss_8k1k" + # DSR1 1K8K Benchmarks benchmark-dsr1-1k8k: needs: get-configs @@ -374,6 +376,7 @@ jobs: with: exp-name: "dsr1_1k8k" + # GPTOSS 1K8K Benchmarks benchmark-gptoss-1k8k: needs: get-configs @@ -409,6 +412,7 @@ jobs: with: exp-name: "gptoss_1k8k" + calc-success-rate: needs: [ diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 5f2b51ec7..05addc161 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -262,7 +262,7 @@ append_lm_eval_summary() { local summary_md="${out_dir}/SUMMARY.md" mkdir -p "$out_dir" || true - python3 utils/lm_eval_to_md.py \ + PYTHONNOUSERSITE=1 PYTHONPATH="" python3 -S utils/lm_eval_to_md.py \ --results-dir "$out_dir" \ --task "${task}" \ --framework "${FRAMEWORK}" \ @@ -281,6 +281,8 @@ append_lm_eval_summary() { fi fi + # Note: Per policy, eval outputs stay under /tmp only; do not copy to workspace. 
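The -S / PYTHONNOUSERSITE=1 / empty-PYTHONPATH trio above is what keeps stray user-site packages (the likely source of the "funny triton error" this patch removes) out of the summarizer process. A rough Python equivalent of the same isolation, with run_isolated as a hypothetical helper name, not shipped code:

    import os
    import subprocess

    def run_isolated(script, *args):
        # Mirror `PYTHONNOUSERSITE=1 PYTHONPATH="" python3 -S ...`:
        # -S skips site initialization so site-packages never lands on
        # sys.path, PYTHONNOUSERSITE=1 ignores ~/.local site-packages,
        # and an empty PYTHONPATH drops injected directories.
        env = dict(os.environ, PYTHONNOUSERSITE="1", PYTHONPATH="")
        return subprocess.run(["python3", "-S", script, *args],
                              env=env, capture_output=True, text=True)

A script launched this way can only rely on the standard library, which presumably suits a JSON-to-Markdown summarizer with no third-party imports.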
+ echo "Results saved to: ${summary_md}" } From 1a3262f574d3908ee89889916c185a9edbe96e10 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 29 Nov 2025 23:14:07 +0800 Subject: [PATCH 160/214] touch up sweep summary --- benchmarks/benchmark_lib.sh | 26 +++++++++++++++++++++++++- runners/launch_b200-nb.sh | 10 ---------- runners/launch_b200-nv.sh | 10 ---------- runners/launch_h100-cw.sh | 10 ---------- runners/launch_h200-cw.sh | 10 ---------- runners/launch_h200-nb.sh | 10 ---------- runners/launch_h200-nv.sh | 10 ---------- runners/launch_mi325x-amd.sh | 10 ---------- runners/launch_mi325x-tw.sh | 10 ---------- 9 files changed, 25 insertions(+), 81 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 05addc161..a2059db26 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -262,6 +262,22 @@ append_lm_eval_summary() { local summary_md="${out_dir}/SUMMARY.md" mkdir -p "$out_dir" || true + # Write minimal meta for collectors that expect it + local meta_json="${out_dir}/meta_env.json" + local model_name="${MODEL_NAME:-$MODEL}" + local dp_json="false" + if [ "${DP_ATTENTION}" = "true" ]; then dp_json="true"; fi + cat > "${meta_json}" <> "${GITHUB_STEP_SUMMARY}" || true - fi - fi -fi diff --git a/runners/launch_b200-nv.sh b/runners/launch_b200-nv.sh index 8a1afff8e..243e624f9 100644 --- a/runners/launch_b200-nv.sh +++ b/runners/launch_b200-nv.sh @@ -23,13 +23,3 @@ srun --jobid=$JOB_ID \ bash benchmarks/${MODEL_CODE}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_slurm.sh scancel $JOB_ID - -# Append eval summary within this same step when available -if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then - GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" - if [ -d "${GH_SUM_DIR}" ]; then - if [ -f "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" ]; then - cat "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" >> "${GITHUB_STEP_SUMMARY}" || true - fi - fi -fi diff --git a/runners/launch_h100-cw.sh b/runners/launch_h100-cw.sh index 864dc9c95..0179bdd57 100644 --- a/runners/launch_h100-cw.sh +++ b/runners/launch_h100-cw.sh @@ -18,13 +18,3 @@ srun --jobid=$JOB_ID \ bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_h100_slurm.sh scancel $JOB_ID - -# Append eval summary within this same step when available -if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then - GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" - if [ -d "${GH_SUM_DIR}" ]; then - if [ -f "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" ]; then - cat "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" >> "${GITHUB_STEP_SUMMARY}" || true - fi - fi -fi diff --git a/runners/launch_h200-cw.sh b/runners/launch_h200-cw.sh index 431e027f2..dd4937606 100644 --- a/runners/launch_h200-cw.sh +++ b/runners/launch_h200-cw.sh @@ -30,13 +30,3 @@ srun --jobid=$JOB_ID \ bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}_slurm.sh scancel $JOB_ID - -# Append eval summary within this same step when available -if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then - GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" - if [ -d "${GH_SUM_DIR}" ]; then - if [ -f "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" ]; then - cat "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" >> "${GITHUB_STEP_SUMMARY}" || true - fi - fi -fi diff --git a/runners/launch_h200-nb.sh b/runners/launch_h200-nb.sh index 19d6e82ba..c76b366d2 100644 --- a/runners/launch_h200-nb.sh +++ b/runners/launch_h200-nb.sh @@ -30,13 +30,3 @@ srun --jobid=$JOB_ID \ bash 
benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}_slurm.sh scancel $JOB_ID - -# Append eval summary within this same step when available -if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then - GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" - if [ -d "${GH_SUM_DIR}" ]; then - if [ -f "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" ]; then - cat "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" >> "${GITHUB_STEP_SUMMARY}" || true - fi - fi -fi diff --git a/runners/launch_h200-nv.sh b/runners/launch_h200-nv.sh index ca2ea6079..5319f8959 100644 --- a/runners/launch_h200-nv.sh +++ b/runners/launch_h200-nv.sh @@ -23,13 +23,3 @@ srun --jobid=$JOB_ID \ bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}_slurm.sh scancel $JOB_ID - -# Append eval summary within this same step when available -if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then - GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" - if [ -d "${GH_SUM_DIR}" ]; then - if [ -f "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" ]; then - cat "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" >> "${GITHUB_STEP_SUMMARY}" || true - fi - fi -fi diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh index 68affc9a1..eb5f8e00c 100644 --- a/runners/launch_mi325x-amd.sh +++ b/runners/launch_mi325x-amd.sh @@ -23,13 +23,3 @@ srun --jobid=$JOB_ID \ bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_mi325x_slurm.sh scancel $JOB_ID - -# Append eval summary within this same step when available -if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then - GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" - if [ -d "${GH_SUM_DIR}" ]; then - if [ -f "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" ]; then - cat "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" >> "${GITHUB_STEP_SUMMARY}" || true - fi - fi -fi diff --git a/runners/launch_mi325x-tw.sh b/runners/launch_mi325x-tw.sh index aa87a424d..ed6ff288e 100644 --- a/runners/launch_mi325x-tw.sh +++ b/runners/launch_mi325x-tw.sh @@ -23,13 +23,3 @@ srun --jobid=$JOB_ID \ bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_mi325x_slurm.sh scancel $JOB_ID - -# Fallback: append summary after job completes if container couldn't write directly -if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then - GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" - if [ -d "${GH_SUM_DIR}" ]; then - if [ -f "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" ]; then - cat "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" >> "${GITHUB_STEP_SUMMARY}" || true - fi - fi -fi From 733d7ca50936e348b92cfc975fe099c184ea6bd7 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 30 Nov 2025 00:51:00 +0800 Subject: [PATCH 161/214] touch up run name --- .github/workflows/benchmark-tmpl.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 23ea7d518..424d67b26 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -74,7 +74,7 @@ jobs: benchmark: runs-on: ${{ inputs.runner }} timeout-minutes: 180 - name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.framework }} ${{ inputs.precision }} tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} conc=${{ inputs.conc }} eval=${{ inputs.run-eval }}' + name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.framework }} ${{ inputs.run-eval }} ${{ inputs.precision }} tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} conc=${{ inputs.conc }}' 
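The meta_env.json written in PATCH 160 above is the hand-off between each benchmark job and whatever later collects the eval results. A minimal sketch of the writer side in Python, assuming the same six fields the shell heredoc emits; write_meta_env and the argument values are illustrative, not shipped code:

    import json
    from pathlib import Path

    def write_meta_env(out_dir, model, framework, precision, tp, ep, dp_attention):
        # Same shape as the META heredoc in benchmark_lib.sh: one flat JSON
        # object that a collector can read back without knowing the run.
        meta = {
            "model": model,
            "framework": framework,
            "precision": precision,
            "tp": tp,
            "ep": ep,
            "dp_attention": dp_attention,
        }
        path = Path(out_dir) / "meta_env.json"
        path.write_text(json.dumps(meta, indent=2))
        return path

For example, write_meta_env("/tmp/eval_out", "gpt-oss", "vllm", "fp4", 8, 1, False) drops a file the sweep tooling can group on later.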
steps: - name: Resource cleanup run: | From 68c1a2db67851c9a94b5d53a2b9ceac766c7b522 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 30 Nov 2025 15:58:27 +0800 Subject: [PATCH 162/214] Missing eval env var docker --- .github/workflows/benchmark-tmpl.yml | 2 +- benchmarks/benchmark_lib.sh | 10 +++++----- benchmarks/dsr1_fp8_h200_trt_slurm.sh | 7 +++++++ runners/launch_b200-tg.sh | 2 +- runners/launch_h100-cr.sh | 2 +- runners/launch_mi300x-amd.sh | 2 +- runners/launch_mi300x-cr.sh | 2 +- runners/launch_mi355x-amd.sh | 2 +- 8 files changed, 18 insertions(+), 11 deletions(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 424d67b26..20d821541 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -74,7 +74,7 @@ jobs: benchmark: runs-on: ${{ inputs.runner }} timeout-minutes: 180 - name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.framework }} ${{ inputs.run-eval }} ${{ inputs.precision }} tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} conc=${{ inputs.conc }}' + ame: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.framework }} ${{ inputs.precision }} ${{ inputs.run-eval && ''eval '' || '''' }}tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} conc=${{ inputs.conc }}' steps: - name: Resource cleanup run: | diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index a2059db26..89581c65a 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -128,18 +128,18 @@ run_benchmark_serving() { _install_lm_eval_deps() { set +x python3 -m pip install -q --no-cache-dir "lm-eval[api]" || true - # Temporary: workaround known harness issue by using main + # Temporary: workaround issue by using main python3 -m pip install -q --no-cache-dir --no-deps \ "git+https://github.com/EleutherAI/lm-evaluation-harness.git@main" || true } # Patch lm-eval filters to be robust to empty strings via sitecustomize -# Patch lm-eval filters to be robust to empty strings via sitecustomize -_patch_lm_eval_filters() { +_patch_lm_eval() { set +x local patch_dir patch_dir="$(mktemp -d)" cat > "$patch_dir/sitecustomize.py" <<'PY' +# --- Patch LocalChatCompletion.parse_generations to handle empty content with reasoning_content --- import re, sys, unicodedata, json from lm_eval.filters import extraction as ex from lm_eval.models.openai_completions import LocalChatCompletion as _LCC @@ -167,7 +167,7 @@ def _le_parse_generations(outputs, **kwargs): # Keep staticmethod semantics _LCC.parse_generations = staticmethod(_le_parse_generations) -# --- Patch TemplateAPI.apply_chat_template to avoid injecting "type": "text" --- +# --- Patch TemplateAPI.apply_chat_template to avoid injecting "type": "text" for TRT --- try: from lm_eval.models import api_models as _api_models _TemplateAPI = _api_models.TemplateAPI @@ -234,7 +234,7 @@ run_lm_eval() { done _install_lm_eval_deps - _patch_lm_eval_filters + _patch_lm_eval local openai_server_base="http://0.0.0.0:${port}" local openai_chat_base="${openai_server_base}/v1/chat/completions" diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index ac6bc167c..be46a1768 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -86,3 +86,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; 
then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 )) + append_lm_eval_summary +fi +set +x \ No newline at end of file diff --git a/runners/launch_b200-tg.sh b/runners/launch_b200-tg.sh index b82e25276..9709b7a87 100644 --- a/runners/launch_b200-tg.sh +++ b/runners/launch_b200-tg.sh @@ -24,7 +24,7 @@ docker run --rm -d --network host --name $server_name \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE \ -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ +-e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ ${GH_SUM_ENV} ${GH_SUM_MOUNT} \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \ diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh index 9815e4884..ee2dab3da 100644 --- a/runners/launch_h100-cr.sh +++ b/runners/launch_h100-cr.sh @@ -20,7 +20,7 @@ docker run --rm --network=host --name=$server_name \ --runtime=nvidia --gpus=all --ipc=host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \ +-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RUN_EVAL -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \ -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ ${GH_SUM_ENV} ${GH_SUM_MOUNT} \ --entrypoint=/bin/bash \ diff --git a/runners/launch_mi300x-amd.sh b/runners/launch_mi300x-amd.sh index 85fa1f8c7..c721c44e9 100644 --- a/runners/launch_mi300x-amd.sh +++ b/runners/launch_mi300x-amd.sh @@ -24,7 +24,7 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ --e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \ +-e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL \ ${GH_SUM_ENV} ${GH_SUM_MOUNT} \ --entrypoint=/bin/bash \ $IMAGE \ diff --git a/runners/launch_mi300x-cr.sh b/runners/launch_mi300x-cr.sh index 4c9d56e7e..2084fbff0 100644 --- a/runners/launch_mi300x-cr.sh +++ b/runners/launch_mi300x-cr.sh @@ -24,7 +24,7 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ --e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \ +-e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL \ ${GH_SUM_ENV} ${GH_SUM_MOUNT} \ --entrypoint=/bin/bash \ $IMAGE \ diff --git a/runners/launch_mi355x-amd.sh b/runners/launch_mi355x-amd.sh index b1b11ff95..3ab15800e 100644 --- a/runners/launch_mi355x-amd.sh +++ b/runners/launch_mi355x-amd.sh @@ -46,7 +46,7 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name 
\ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT -e NUM_PROMPTS \ --e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \ +-e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL \ ${GH_SUM_ENV} ${GH_SUM_MOUNT} \ --entrypoint=/bin/bash \ $IMAGE \ From 6cb94a766308ce7008dd68728d673ab40a504a7a Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 30 Nov 2025 16:16:10 +0800 Subject: [PATCH 163/214] Typo --- .github/workflows/benchmark-tmpl.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 20d821541..f33bd8157 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -74,7 +74,7 @@ jobs: benchmark: runs-on: ${{ inputs.runner }} timeout-minutes: 180 - ame: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.framework }} ${{ inputs.precision }} ${{ inputs.run-eval && ''eval '' || '''' }}tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} conc=${{ inputs.conc }}' + name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.framework }} ${{ inputs.precision }} ${{ inputs.run-eval && ''eval '' || '''' }}tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} conc=${{ inputs.conc }}' steps: - name: Resource cleanup run: | From bc472c3b36aef3642b6063e35fce3defae07bb92 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 1 Dec 2025 00:19:44 +0800 Subject: [PATCH 164/214] Add proper coverage --- utils/matrix-logic/generate_sweep_configs.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index 1110e3c30..a9afc2bc9 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -89,16 +89,24 @@ def validate_matrix_output(matrix_values: List[dict]) -> List[dict]: def mark_eval_entries(matrix_values: List[dict]) -> List[dict]: """Mark entries that should run evaluation. 
- For each unique (model, runner, isl, osl) combination: + For each unique (model, runner, framework, precision, isl, osl) combination: - Mark highest TP with highest conc - Mark lowest TP with highest conc """ from collections import defaultdict - # Group entries by (model, runner, isl, osl) + # Group entries by (model, runner, framework, precision, isl, osl) + # This ensures we compare within the same configuration, not across different frameworks groups = defaultdict(list) for i, entry in enumerate(matrix_values): - key = (entry[FIELD_MODEL], entry[FIELD_RUNNER], entry[FIELD_ISL], entry[FIELD_OSL]) + key = ( + entry[FIELD_MODEL], + entry[FIELD_RUNNER], + entry[FIELD_FRAMEWORK], + entry[FIELD_PRECISION], + entry[FIELD_ISL], + entry[FIELD_OSL] + ) groups[key].append((i, entry)) # For each group, find highest TP/highest conc and lowest TP/highest conc From 2461447a4f1052de4e1b2d321273fbeb2a18cd28 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 3 Dec 2025 00:09:41 +0800 Subject: [PATCH 165/214] Add evals --- .github/workflows/benchmark-tmpl.yml | 24 ++- .github/workflows/collect-evals.yml | 45 +++++ .github/workflows/full-sweep-test.yml | 48 ++++++ utils/collect_eval_results.py | 237 ++++++++++++++++++++++++++ 4 files changed, 353 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/collect-evals.yml create mode 100644 utils/collect_eval_results.py diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index d5dc1e1b8..c8d6e2764 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -135,6 +135,9 @@ jobs: env: RUNNER_NAME: ${{ runner.name }} RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_${{ env.FRAMEWORK }}_tp${{ env.TP }}_ep${{ env.EP_SIZE }}_dpa_${{ env.DP_ATTENTION }}_conc${{ env.CONC }}_${{ runner.name }} + # Suppress per-job eval markdown from being appended to the step summary. + # We'll publish a single combined eval table in the collection job instead. 
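For intuition, here is the selection rule from mark_eval_entries (as refined above) reduced to a single (model, runner, framework, precision, isl, osl) group, with made-up TP and concurrency values:

    entries = [
        {"tp": 1, "conc": 4}, {"tp": 1, "conc": 64},
        {"tp": 8, "conc": 4}, {"tp": 8, "conc": 64},
    ]

    min_tp = min(e["tp"] for e in entries)
    max_tp = max(e["tp"] for e in entries)
    picked = []
    for tp in sorted({min_tp, max_tp}):
        candidates = [e for e in entries if e["tp"] == tp]
        picked.append(max(candidates, key=lambda e: e["conc"]))

    # The highest-concurrency entry at the lowest and at the highest TP
    # is marked for eval: [{'tp': 1, 'conc': 64}, {'tp': 8, 'conc': 64}]
    print(picked)

When a group has only one TP value, min_tp equals max_tp and a single entry is marked, which is what the min_tp != max_tp guard in the real code handles.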
+ GITHUB_STEP_SUMMARY: '' run: | bash ./runners/launch_${RUNNER_NAME%%_*}.sh FOUND_RESULT_FILE= @@ -162,4 +165,23 @@ jobs: uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 with: name: ${{ env.RESULT_FILENAME }} - path: agg_${{ env.RESULT_FILENAME }}.json \ No newline at end of file + path: agg_${{ env.RESULT_FILENAME }}.json + + - name: Upload eval results (if any) + if: ${{ env.RUN_EVAL == 'true' }} + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 + with: + name: eval_${{ env.EXP_NAME }}_${{ env.RESULT_FILENAME }} + path: eval_out/${{ env.RESULT_FILENAME }} + + - name: Cleanup eval outputs (post-upload) + if: ${{ env.RUN_EVAL == 'true' }} + run: | + if [ -n "${RESULT_FILENAME:-}" ] && [ -e "eval_out/${RESULT_FILENAME}" ]; then + echo "Removing eval dir: eval_out/${RESULT_FILENAME}" + rm -rf --one-file-system "eval_out/${RESULT_FILENAME}" || rm -rf "eval_out/${RESULT_FILENAME}" || true + fi + # Also remove empty parent folder if present + if [ -d "eval_out" ]; then + rmdir eval_out 2>/dev/null || true + fi diff --git a/.github/workflows/collect-evals.yml b/.github/workflows/collect-evals.yml new file mode 100644 index 000000000..6f7858238 --- /dev/null +++ b/.github/workflows/collect-evals.yml @@ -0,0 +1,45 @@ +name: Template - Collect Evals + +on: + workflow_call: + inputs: + exp-name: + required: false + type: string + default: '' + +permissions: + contents: read + +jobs: + collect-evals: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + token: ${{ secrets.REPO_PAT }} + fetch-depth: 0 + + - name: Download eval artifacts + uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0 + with: + path: eval_results/ + pattern: ${{ inputs.exp-name && format('eval_{0}_*', inputs.exp-name) || 'eval_*' }} + + - name: Summarize evals + run: | + echo "## 📋 Eval Summary - ${{ inputs.exp-name || 'all' }}" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + python3 utils/collect_eval_results.py eval_results/ ${{ inputs.exp-name || 'all' }} >> $GITHUB_STEP_SUMMARY + + - name: Upload aggregated evals + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 + with: + name: eval_results_${{ inputs.exp-name || 'all' }} + path: agg_eval_${{ inputs.exp-name || 'all' }}.json + + - name: Cleanup downloaded eval artifacts + if: ${{ always() }} + run: | + rm -rf eval_results/ || true diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index 8d96d5ac3..ad6bbb5ac 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -140,6 +140,14 @@ jobs: with: exp-name: "dsr1_1k1k" + collect-dsr1-1k1k-evals: + needs: benchmark-dsr1-1k1k + if: ${{ always() && needs.get-configs.outputs.dsr1-1k1k != '[]' }} + uses: ./.github/workflows/collect-evals.yml + secrets: inherit + with: + exp-name: "dsr1_1k1k" + # GPTOSS 1K1K Benchmarks benchmark-gptoss-1k1k: needs: get-configs @@ -175,6 +183,14 @@ jobs: with: exp-name: "gptoss_1k1k" + collect-gptoss-1k1k-evals: + needs: benchmark-gptoss-1k1k + if: ${{ always() && needs.get-configs.outputs.gptoss-1k1k != '[]' }} + uses: ./.github/workflows/collect-evals.yml + secrets: inherit + with: + exp-name: "gptoss_1k1k" + # DSR1 8K1K Benchmarks benchmark-dsr1-8k1k: @@ -211,6 +227,14 @@ jobs: with: exp-name: "dsr1_8k1k" + collect-dsr1-8k1k-evals: + needs: benchmark-dsr1-8k1k + if: ${{ always() && 
needs.get-configs.outputs.dsr1-8k1k != '[]' }} + uses: ./.github/workflows/collect-evals.yml + secrets: inherit + with: + exp-name: "dsr1_8k1k" + # GPTOSS 8K1K Benchmarks benchmark-gptoss-8k1k: needs: get-configs @@ -246,6 +270,14 @@ jobs: with: exp-name: "gptoss_8k1k" + collect-gptoss-8k1k-evals: + needs: benchmark-gptoss-8k1k + if: ${{ always() && needs.get-configs.outputs.gptoss-8k1k != '[]' }} + uses: ./.github/workflows/collect-evals.yml + secrets: inherit + with: + exp-name: "gptoss_8k1k" + # DSR1 1K8K Benchmarks benchmark-dsr1-1k8k: @@ -376,6 +408,14 @@ jobs: with: exp-name: "dsr1_1k8k" + collect-dsr1-1k8k-evals: + needs: benchmark-dsr1-1k8k + if: ${{ always() && needs.get-configs.outputs.dsr1-1k8k != '[]' }} + uses: ./.github/workflows/collect-evals.yml + secrets: inherit + with: + exp-name: "dsr1_1k8k" + # GPTOSS 1K8K Benchmarks benchmark-gptoss-1k8k: @@ -412,6 +452,14 @@ jobs: with: exp-name: "gptoss_1k8k" + collect-gptoss-1k8k-evals: + needs: benchmark-gptoss-1k8k + if: ${{ always() && needs.get-configs.outputs.gptoss-1k8k != '[]' }} + uses: ./.github/workflows/collect-evals.yml + secrets: inherit + with: + exp-name: "gptoss_1k8k" + calc-success-rate: needs: diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py new file mode 100644 index 000000000..de2af26e4 --- /dev/null +++ b/utils/collect_eval_results.py @@ -0,0 +1,237 @@ +#!/usr/bin/env python3 +import os +import sys +import json +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + + +def find_eval_sets(root: Path) -> List[Path]: + """Return directories that contain a meta_env.json (one set per job).""" + out: List[Path] = [] + for p in root.rglob('meta_env.json'): + out.append(p.parent) + return out + + +def load_json(path: Path) -> Optional[Dict[str, Any]]: + try: + with open(path, 'r') as f: + return json.load(f) + except Exception: + return None + + +def detect_eval_jsons(d: Path) -> Tuple[Optional[Path], Optional[Path]]: + """Return (lm_eval_json, lighteval_json) if present (latest by mtime).""" + lm: List[Tuple[float, Path]] = [] + le: List[Tuple[float, Path]] = [] + for p in d.rglob('*.json'): + if p.name == 'meta_env.json': + continue + data = load_json(p) + if not isinstance(data, dict): + continue + # Heuristics similar to utils/lm_eval_to_md.py + if 'lm_eval_version' in data or 'pretty_env_info' in data: + try: + lm.append((p.stat().st_mtime, p)) + except Exception: + lm.append((0, p)) + elif 'config_general' in data and 'results' in data: + try: + le.append((p.stat().st_mtime, p)) + except Exception: + le.append((0, p)) + elif 'results' in data: + # Fallback: treat as lm-eval JSON + try: + lm.append((p.stat().st_mtime, p)) + except Exception: + lm.append((0, p)) + lm_path = sorted(lm, key=lambda x: x[0])[-1][1] if lm else None + le_path = sorted(le, key=lambda x: x[0])[-1][1] if le else None + return lm_path, le_path + + +def parse_pretty_env(pretty_env: str) -> str: + try: + lines = [l for l in pretty_env.splitlines() if l.startswith('GPU ')] + names = [l.split(':', 1)[1].strip() for l in lines] + if not names: + return 'Unknown GPU' + # Compress identical names (roughly) + from collections import Counter + c = Counter(names) + return ' + '.join([f"{n}× {name}" for name, n in c.items()]) + except Exception: + return 'Unknown GPU' + + +def extract_lm_metrics(json_path: Path, task: Optional[str] = None) -> Dict[str, Any]: + data = load_json(json_path) or {} + results = data.get('results') or {} + # Pick task + t = task + if not t: + if isinstance(results, dict) and 
results:
+            t = next(iter(results.keys()))
+        else:
+            t = 'unknown'
+
+    res = results.get(t, {}) if isinstance(results, dict) else {}
+    strict = res.get('exact_match,strict-match')
+    flex = res.get('exact_match,flexible-extract')
+    strict_se = res.get('exact_match_stderr,strict-match')
+    flex_se = res.get('exact_match_stderr,flexible-extract')
+
+    n_eff = None
+    ns = data.get('n-samples') or data.get('n_samples') or {}
+    if isinstance(ns, dict):
+        td = ns.get(t) or {}
+        if isinstance(td, dict):
+            n_eff = td.get('effective') or td.get('n_eff')
+
+    hardware = 'Unknown GPU'
+    pe = data.get('pretty_env_info')
+    if isinstance(pe, str) and pe:
+        hardware = parse_pretty_env(pe)
+
+    model = (
+        data.get('model_name')
+        or (data.get('configs', {}).get(t, {}) or {}).get('metadata', {}).get('model')
+        or (data.get('config') or {}).get('model')
+        or ''
+    )
+
+    return {
+        'task': t,
+        'strict': strict,
+        'flex': flex,
+        'strict_se': strict_se,
+        'flex_se': flex_se,
+        'n_eff': n_eff,
+        'hardware': hardware,
+        'model': model,
+        'source': str(json_path)
+    }
+
+
+def extract_lighteval_metrics(json_path: Path, task_base: Optional[str] = None) -> Dict[str, Any]:
+    data = load_json(json_path) or {}
+    results = data.get('results', {}) or {}
+    # Choose a task key starting with task_base if provided, else 'all', else first key
+    key = None
+    if task_base:
+        for k in results.keys():
+            if str(k).startswith(task_base):
+                key = k
+                break
+    if key is None:
+        key = 'all' if 'all' in results else (next(iter(results.keys())) if results else 'unknown')
+    r = results.get(key, {}) if isinstance(results, dict) else {}
+    em = r.get('extractive_match')
+    em_se = r.get('extractive_match_stderr')
+
+    model = ''
+    cg = data.get('config_general', {}) or {}
+    model = cg.get('model_name') or cg.get('model_config', {}).get('model_name', '')
+
+    return {
+        'task': key,
+        'strict': em,
+        'flex': None,
+        'strict_se': em_se,
+        'flex_se': None,
+        'n_eff': None,
+        'hardware': 'Unknown GPU',
+        'model': model,
+        'source': str(json_path)
+    }
+
+
+def pct(x: Any) -> str:
+    try:
+        return f"{float(x)*100:.2f}%"
+    except Exception:
+        return 'N/A'
+
+
+def se(x: Any) -> str:
+    try:
+        return f" ±{float(x)*100:.2f}%"
+    except Exception:
+        return ''
+
+
+def main():
+    if len(sys.argv) < 3:
+        print('Usage: collect_eval_results.py <results_dir> <exp_name>')
+        sys.exit(1)
+
+    root = Path(sys.argv[1])
+    exp_name = sys.argv[2] or 'all'
+
+    rows: List[Dict[str, Any]] = []
+    for d in find_eval_sets(root):
+        meta = load_json(d / 'meta_env.json') or {}
+        lm_path, le_path = detect_eval_jsons(d)
+        # Prefer lm-eval when available, else lighteval
+        if lm_path:
+            m = extract_lm_metrics(lm_path)
+        elif le_path:
+            m = extract_lighteval_metrics(le_path)
+        else:
+            continue
+
+        # Merge with meta
+        row = {
+            'model': m.get('model') or meta.get('model') or 'unknown',
+            'hw': m.get('hardware', 'Unknown GPU'),
+            'framework': (meta.get('framework') or 'unknown').lower(),
+            'precision': (meta.get('precision') or 'unknown').lower(),
+            'tp': int(meta.get('tp') or 1),
+            'ep': int(meta.get('ep') or 1),
+            'dp_attention': str(meta.get('dp_attention') or 'false'),
+            'task': m.get('task') or 'unknown',
+            'em_strict': m.get('strict'),
+            'em_strict_se': m.get('strict_se'),
+            'em_flexible': m.get('flex'),
+            'em_flexible_se': m.get('flex_se'),
+            'n_eff': m.get('n_eff'),
+            'source': m.get('source'),
+        }
+        rows.append(row)
+
+    # Sort for stable output
+    rows.sort(key=lambda r: (r.get('model',''), r.get('hw',''), r.get('framework',''), r.get('precision',''), r.get('tp',0), r.get('ep',0)))
+
+    if not rows:
print('> No eval results found to summarize.') + else: + # Print Markdown summary table + print('| Model | Hardware | Framework | Precision | TP | EP | DPA | Task | EM Strict | EM Flexible | N (eff) |') + print('| :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |') + for r in rows: + print( + f"| {r['model']} " + f"| {r['hw']} " + f"| {r['framework'].upper()} " + f"| {r['precision'].upper()} " + f"| {r['tp']} " + f"| {r['ep']} " + f"| {r['dp_attention']} " + f"| {r['task']} " + f"| {pct(r['em_strict'])}{se(r['em_strict_se'])} " + f"| {pct(r['em_flexible'])}{se(r['em_flexible_se'])} " + f"| {r['n_eff'] or ''} |" + ) + + # Write JSON aggregate + out_path = Path(f'agg_eval_{exp_name}.json') + with open(out_path, 'w') as f: + json.dump(rows, f, indent=2) + + +if __name__ == '__main__': + main() From 710d4280a0ac0527d5765a6e6da6660235e68710 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 3 Dec 2025 01:12:29 +0800 Subject: [PATCH 166/214] Cam's solution --- .github/workflows/benchmark-tmpl.yml | 17 +++-- benchmarks/benchmark_lib.sh | 28 ++++++++- utils/collect_eval_results.py | 92 ++++++++++++++++++++-------- 3 files changed, 98 insertions(+), 39 deletions(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index c8d6e2764..559f24fc9 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -172,16 +172,15 @@ jobs: uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 with: name: eval_${{ env.EXP_NAME }}_${{ env.RESULT_FILENAME }} - path: eval_out/${{ env.RESULT_FILENAME }} + path: | + SUMMARY.md + meta_env.json + results*.json + if-no-files-found: ignore - name: Cleanup eval outputs (post-upload) if: ${{ env.RUN_EVAL == 'true' }} run: | - if [ -n "${RESULT_FILENAME:-}" ] && [ -e "eval_out/${RESULT_FILENAME}" ]; then - echo "Removing eval dir: eval_out/${RESULT_FILENAME}" - rm -rf --one-file-system "eval_out/${RESULT_FILENAME}" || rm -rf "eval_out/${RESULT_FILENAME}" || true - fi - # Also remove empty parent folder if present - if [ -d "eval_out" ]; then - rmdir eval_out 2>/dev/null || true - fi + rm -f SUMMARY.md meta_env.json || true + # Remove any eval results JSONs that were moved into workspace + rm -f results*.json || true diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 8fcf9a707..ad75fc9bd 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -326,9 +326,28 @@ META fi fi - # Note: Per policy, eval outputs stay under /tmp only; do not copy to workspace. 
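The pct and se helpers in collect_eval_results.py above deliberately swallow malformed values so a single bad result JSON cannot take down the whole summary table. A short usage sketch:

    def pct(x):
        try:
            return f"{float(x) * 100:.2f}%"
        except Exception:
            return 'N/A'

    def se(x):
        try:
            return f" ±{float(x) * 100:.2f}%"
        except Exception:
            return ''

    # A strict-match accuracy of 0.953 with stderr 0.006 renders as
    # "95.30% ±0.60%", while a missing value degrades to "N/A".
    print(pct(0.953) + se(0.006))   # 95.30% ±0.60%
    print(pct(None))                # N/A

Markdown cells stay well-formed either way, so one failed eval shows up as an N/A row instead of a broken table.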
+  # Move eval artifacts into PWD (no new directories in workspace)
+  if [ -f "${summary_md}" ]; then
+    mv -f "${summary_md}" ./ || true
+  fi
+  if [ -f "${meta_json}" ]; then
+    mv -f "${meta_json}" ./ || true
+  fi
+  if [ -d "${out_dir}" ]; then
+    while IFS= read -r -d '' jf; do
+      base=$(basename "$jf")
+      if [ "$base" != "meta_env.json" ] && [ "$base" != "SUMMARY.md" ]; then
+        mv -f "$jf" ./ || true
+      fi
+    done < <(find "${out_dir}" -type f -name "*.json" -print0 2>/dev/null)
+  fi
 
-  echo "Results saved to: ${summary_md}"
+  # Best-effort cleanup of the temp directory
+  if [ -n "${out_dir}" ] && [ -d "${out_dir}" ]; then
+    rm -rf --one-file-system "${out_dir}" || rm -rf "${out_dir}" || true
+  fi
+
+  echo "Moved eval artifacts to: $(pwd)"
 }
 
 # ------------------------------
@@ -565,7 +584,7 @@ run_lighteval_eval() {
   local port="${PORT:-8888}"
   local task="${EVAL_TASK:-gsm8k}"
   local num_fewshot="${NUM_FEWSHOT:-5}"
-  local results_dir="${EVAL_RESULT_DIR:-eval_out_lighteval}"
+  local results_dir="${EVAL_RESULT_DIR:-$(mktemp -d /tmp/eval_out-XXXXXX)}"
   local max_samples=0
   local concurrent_requests=32
@@ -611,6 +630,9 @@ run_lighteval_eval() {
     output_dir="/workspace/${results_dir}"
   fi
 
+  # Make output dir visible to append_lm_eval_summary
+  export EVAL_RESULT_DIR="$output_dir"
+
   set -x
   lighteval endpoint litellm \
     "${MODEL_ARGS}" \
diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py
index de2af26e4..51b2a71b6 100644
--- a/utils/collect_eval_results.py
+++ b/utils/collect_eval_results.py
@@ -7,8 +7,29 @@
 
 def find_eval_sets(root: Path) -> List[Path]:
-    """Return directories that contain a meta_env.json (one set per job)."""
+    """Return directories that contain a meta_env.json (one set per job).
+
+    New structure: each downloaded artifact is placed under
+    eval_results/<artifact-name>/ with flat files inside, e.g.:
+      - meta_env.json
+      - SUMMARY.md
+      - results_*.json
+
+    We first check immediate child directories for meta_env.json to avoid
+    descending unnecessarily. If nothing is found (backward compatibility),
+    fall back to recursive search.
+    """
     out: List[Path] = []
+    # Prefer immediate children (one directory per artifact)
+    try:
+        for d in root.iterdir():
+            if d.is_dir() and (d / 'meta_env.json').exists():
+                out.append(d)
+    except Exception:
+        pass
+    if out:
+        return out
+    # Fallback: recursive (legacy structure)
     for p in root.rglob('meta_env.json'):
         out.append(p.parent)
     return out
@@ -23,32 +44,49 @@ def load_json(path: Path) -> Optional[Dict[str, Any]]:
 
 def detect_eval_jsons(d: Path) -> Tuple[Optional[Path], Optional[Path]]:
-    """Return (lm_eval_json, lighteval_json) if present (latest by mtime)."""
+    """Return (lm_eval_json, lighteval_json) if present (latest by mtime).
+
+    New structure places result JSONs flat in the artifact directory.
We + first check only the immediate directory for JSONs, then fall back to + recursive search for backward compatibility. + """ + def scan_jsons(paths: List[Path]) -> Tuple[List[Tuple[float, Path]], List[Tuple[float, Path]]]: + lm: List[Tuple[float, Path]] = [] + le: List[Tuple[float, Path]] = [] + for p in paths: + if p.name == 'meta_env.json': + continue + data = load_json(p) + if not isinstance(data, dict): + continue + # Heuristics similar to utils/lm_eval_to_md.py + if 'lm_eval_version' in data or 'pretty_env_info' in data: + try: + lm.append((p.stat().st_mtime, p)) + except Exception: + lm.append((0, p)) + elif 'config_general' in data and 'results' in data: + try: + le.append((p.stat().st_mtime, p)) + except Exception: + le.append((0, p)) + elif 'results' in data: + # Fallback: treat as lm-eval JSON + try: + lm.append((p.stat().st_mtime, p)) + except Exception: + lm.append((0, p)) + return lm, le + + # 1) Prefer immediate JSONs (flat structure) + immediate_jsons = list(d.glob('results*.json')) + [p for p in d.glob('*.json') if p.name != 'meta_env.json'] + lm, le = scan_jsons(immediate_jsons) + + # 2) If nothing found, fallback to deep scan (legacy) + if not lm and not le: + deep_jsons = list(d.rglob('*.json')) + lm, le = scan_jsons(deep_jsons) + lm_path = sorted(lm, key=lambda x: x[0])[-1][1] if lm else None le_path = sorted(le, key=lambda x: x[0])[-1][1] if le else None return lm_path, le_path From 3c8b9bc792203fb00e246d1cecd3fb12b27e8044 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 3 Dec 2025 03:31:26 +0800 Subject: [PATCH 167/214] b200 scancel fix --- runners/launch_b200-nb.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/runners/launch_b200-nb.sh b/runners/launch_b200-nb.sh index f9b68c025..9a3dfa909 100644 --- a/runners/launch_b200-nb.sh +++ b/runners/launch_b200-nb.sh @@ -13,5 +13,3 @@ srun --partition=$PARTITION --gres=gpu:$TP --exclusive \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT_OFFSET=${USER: -1} \ bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_slurm.sh - -scancel $JOB_ID From 1390c5230f32c68fbd1b64d4e5e013a2ba12a868 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 3 Dec 2025 04:08:08 +0800 Subject: [PATCH 168/214] Change to 2 fewshot, forgot eval env var in b200 --- benchmarks/benchmark_lib.sh | 2 +- benchmarks/dsr1_fp4_b200_trt_slurm.sh | 7 +++++++ benchmarks/dsr1_fp8_b200_trt_slurm.sh | 7 +++++++ runners/launch_b200-dgxc.sh | 2 +- runners/launch_b200-nb.sh | 2 +- runners/launch_b200-nvd.sh | 2 +- 6 files changed, 18 insertions(+), 4 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index ad75fc9bd..e68e1b21d 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -242,7 +242,7 @@ PY run_lm_eval() { local port="${PORT:-8888}" local task="${EVAL_TASK:-gsm8k}" - local num_fewshot="${NUM_FEWSHOT:-5}" + local num_fewshot="${NUM_FEWSHOT:-2}" local results_dir="${EVAL_RESULT_DIR:-$(mktemp -d /tmp/eval_out-XXXXXX)}" local gen_max_tokens=4096 local temperature=0 diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh index aa2be7648..f4165b72a 100644 --- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh @@ -116,3 +116,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port 
"$PORT" --concurrent-requests $(( $CONC * 2 )) + append_lm_eval_summary +fi +set +x \ No newline at end of file diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh index 58d24a7ed..c77f5277f 100644 --- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh @@ -86,3 +86,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 )) + append_lm_eval_summary +fi +set +x \ No newline at end of file diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index 4d8ec0aed..25d09313e 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -41,7 +41,7 @@ docker run --rm --init --network host --name $server_name \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE -e DP_ATTENTION \ -e NCCL_GRAPH_REGISTER=0 \ -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e NUM_PROMPTS \ +-e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e NUM_PROMPTS -e RUN_EVAL \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh" diff --git a/runners/launch_b200-nb.sh b/runners/launch_b200-nb.sh index 9a3dfa909..44392e3aa 100644 --- a/runners/launch_b200-nb.sh +++ b/runners/launch_b200-nb.sh @@ -12,4 +12,4 @@ srun --partition=$PARTITION --gres=gpu:$TP --exclusive \ --no-container-mount-home --container-writable \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT_OFFSET=${USER: -1} \ -bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_slurm.sh +bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_slurm.sh \ No newline at end of file diff --git a/runners/launch_b200-nvd.sh b/runners/launch_b200-nvd.sh index ebfa67458..c6ae289bb 100644 --- a/runners/launch_b200-nvd.sh +++ b/runners/launch_b200-nvd.sh @@ -42,7 +42,7 @@ docker run --rm --init --network host --name $server_name \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE -e DP_ATTENTION \ -e NCCL_GRAPH_REGISTER=0 \ -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e NUM_PROMPTS \ +-e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e NUM_PROMPTS -e RUN_EVAL \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh" From 544e6986cd4e948a12aebf53ab9ad6a1837d05c3 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 3 Dec 2025 16:26:57 +0800 Subject: [PATCH 169/214] Resolve issues --- .github/workflows/benchmark-tmpl.yml | 3 +- .github/workflows/collect-evals.yml | 2 +- .github/workflows/drain-b200-nvd2.yml | 32 -- .../workflows/full-sweep-1k1k-scheduler.yml | 4 +- .../workflows/full-sweep-1k8k-scheduler.yml | 4 +- .../workflows/full-sweep-8k1k-scheduler.yml | 4 +- .github/workflows/full-sweep-test.yml | 12 +- benchmarks/benchmark_lib.sh | 51 ++- 
benchmarks/gptoss_fp4_b200_trt_slurm.sh | 7 + utils/collect_eval_results.py | 108 ++++++- utils/evals/custom_gsm8k.py | 4 +- utils/evals/gsm8k.yaml | 2 + utils/lm_eval_to_md.py | 304 ------------------ utils/matrix-logic/generate_sweep_configs.py | 10 +- 14 files changed, 158 insertions(+), 389 deletions(-) delete mode 100644 .github/workflows/drain-b200-nvd2.yml delete mode 100644 utils/lm_eval_to_md.py diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 559f24fc9..8a8943628 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -173,7 +173,6 @@ jobs: with: name: eval_${{ env.EXP_NAME }}_${{ env.RESULT_FILENAME }} path: | - SUMMARY.md meta_env.json results*.json if-no-files-found: ignore @@ -181,6 +180,6 @@ jobs: - name: Cleanup eval outputs (post-upload) if: ${{ env.RUN_EVAL == 'true' }} run: | - rm -f SUMMARY.md meta_env.json || true + rm -f meta_env.json || true # Remove any eval results JSONs that were moved into workspace rm -f results*.json || true diff --git a/.github/workflows/collect-evals.yml b/.github/workflows/collect-evals.yml index 6f7858238..c45842ef2 100644 --- a/.github/workflows/collect-evals.yml +++ b/.github/workflows/collect-evals.yml @@ -29,7 +29,7 @@ jobs: - name: Summarize evals run: | - echo "## 📋 Eval Summary - ${{ inputs.exp-name || 'all' }}" >> $GITHUB_STEP_SUMMARY + echo "## Eval Summary - ${{ inputs.exp-name || 'all' }}" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY python3 utils/collect_eval_results.py eval_results/ ${{ inputs.exp-name || 'all' }} >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/drain-b200-nvd2.yml b/.github/workflows/drain-b200-nvd2.yml deleted file mode 100644 index 08646c3b4..000000000 --- a/.github/workflows/drain-b200-nvd2.yml +++ /dev/null @@ -1,32 +0,0 @@ -name: Drain b200-nvd_2 - -on: - push: - paths: - - '.github/workflows/drain-b200-nvd2.yml' - -jobs: - hold: - # Pin specifically to the self-hosted runner label for b200-nvd_2 - runs-on: [self-hosted, b200-nvd_2] - # Hold for 24h by default (override by canceling anytime) - timeout-minutes: 1440 - steps: - - name: Start drain - shell: bash - run: | - set -euo pipefail - echo "Holding runner: $RUNNER_NAME" - echo "Runner OS/Arch: $RUNNER_OS / $RUNNER_ARCH" - echo "Started at: $(date -Iseconds)" - echo "Cancel this workflow run to release the runner." 
- - - name: Hold indefinitely (until timeout or cancel) - shell: bash - run: | - set -euo pipefail - trap 'echo "Release signal received at $(date -Iseconds)"; exit 0' INT TERM - while true; do - echo "Still holding at $(date -Iseconds)" - sleep 1800 - done diff --git a/.github/workflows/full-sweep-1k1k-scheduler.yml b/.github/workflows/full-sweep-1k1k-scheduler.yml index f97cd093c..71b41949f 100644 --- a/.github/workflows/full-sweep-1k1k-scheduler.yml +++ b/.github/workflows/full-sweep-1k1k-scheduler.yml @@ -17,7 +17,7 @@ jobs: - id: get-dsr1-configs run: | pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1 --run-evals) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT get-gptoss-configs: @@ -31,7 +31,7 @@ jobs: - id: get-gptoss-configs run: | pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss --run-evals) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT benchmark-dsr1: diff --git a/.github/workflows/full-sweep-1k8k-scheduler.yml b/.github/workflows/full-sweep-1k8k-scheduler.yml index cd8c07c74..9bf67c8da 100644 --- a/.github/workflows/full-sweep-1k8k-scheduler.yml +++ b/.github/workflows/full-sweep-1k8k-scheduler.yml @@ -17,7 +17,7 @@ jobs: - id: get-dsr1-configs run: | pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1 --run-evals) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT get-gptoss-configs: @@ -31,7 +31,7 @@ jobs: - id: get-gptoss-configs run: | pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss --run-evals) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT benchmark-dsr1: diff --git 
a/.github/workflows/full-sweep-8k1k-scheduler.yml b/.github/workflows/full-sweep-8k1k-scheduler.yml index 036794eef..cfc676911 100644 --- a/.github/workflows/full-sweep-8k1k-scheduler.yml +++ b/.github/workflows/full-sweep-8k1k-scheduler.yml @@ -17,7 +17,7 @@ jobs: - id: get-dsr1-configs run: | pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1 --run-evals) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT get-gptoss-configs: @@ -31,7 +31,7 @@ jobs: - id: get-gptoss-configs run: | pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss --run-evals) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT benchmark-dsr1: diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index ad6bbb5ac..3ba954838 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -63,21 +63,21 @@ jobs: # Generate dsr1 configs (only if we have valid runner types for DSR1) if [ "${{ inputs.run_1k1k }}" = "true" ] && [ -n "$DSR1_RUNNER_TYPES" ]; then - DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml --run-evals) echo "dsr1-1k1k=$DSR1_1K1K" >> $GITHUB_OUTPUT else echo "dsr1-1k1k=[]" >> $GITHUB_OUTPUT fi if [ "${{ inputs.run_1k8k }}" = "true" ] && [ -n "$DSR1_RUNNER_TYPES" ]; then - DSR1_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + DSR1_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config 
${GITHUB_WORKSPACE}/.github/configs/runners.yaml --run-evals) echo "dsr1-1k8k=$DSR1_1K8K" >> $GITHUB_OUTPUT else echo "dsr1-1k8k=[]" >> $GITHUB_OUTPUT fi if [ "${{ inputs.run_8k1k }}" = "true" ] && [ -n "$DSR1_RUNNER_TYPES" ]; then - DSR1_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + DSR1_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml --run-evals) echo "dsr1-8k1k=$DSR1_8K1K" >> $GITHUB_OUTPUT else echo "dsr1-8k1k=[]" >> $GITHUB_OUTPUT @@ -85,21 +85,21 @@ jobs: # Generate gptoss configs (only if we have runner types selected) if [ "${{ inputs.run_1k1k }}" = "true" ] && [ -n "$RUNNER_TYPES" ]; then - GPTOSS_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + GPTOSS_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml --run-evals) echo "gptoss-1k1k=$GPTOSS_1K1K" >> $GITHUB_OUTPUT else echo "gptoss-1k1k=[]" >> $GITHUB_OUTPUT fi if [ "${{ inputs.run_1k8k }}" = "true" ] && [ -n "$RUNNER_TYPES" ]; then - GPTOSS_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + GPTOSS_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml --run-evals) echo "gptoss-1k8k=$GPTOSS_1K8K" >> $GITHUB_OUTPUT else echo "gptoss-1k8k=[]" >> $GITHUB_OUTPUT fi if [ "${{ inputs.run_8k1k }}" = "true" ] && [ -n "$RUNNER_TYPES" ]; then - GPTOSS_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + GPTOSS_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml 
${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml --run-evals) echo "gptoss-8k1k=$GPTOSS_8K1K" >> $GITHUB_OUTPUT else echo "gptoss-8k1k=[]" >> $GITHUB_OUTPUT diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index e68e1b21d..141f66c5a 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -288,7 +288,6 @@ append_lm_eval_summary() { local results_dir="${EVAL_RESULT_DIR}" local task="${EVAL_TASK:-gsm8k}" local out_dir="${results_dir}" - local summary_md="${out_dir}/SUMMARY.md" mkdir -p "$out_dir" || true # Write minimal meta for collectors that expect it @@ -296,10 +295,32 @@ append_lm_eval_summary() { local model_name="${MODEL_NAME:-$MODEL}" local dp_json="false" if [ "${DP_ATTENTION}" = "true" ]; then dp_json="true"; fi + + # Derive framework/precision from env, fallback to parsing RESULT_FILENAME + # RESULT_FILENAME format (from workflow): + # <exp_name>_<precision>_<framework>_tp<...>_ep<...>_dpa_<...>_conc<...>_<runner_name> + local fw="${FRAMEWORK:-}" + local prec="${PRECISION:-}" + if [[ -z "$fw" || -z "$prec" ]]; then + if [[ -n "${RESULT_FILENAME}" ]]; then + # Extract the two fields immediately before "_tp" + # Handles arbitrary underscores in exp_name by matching from the end + local parsed + parsed=$(echo "${RESULT_FILENAME}" | sed -n 's/.*_\([^_][^_]*\)_\([^_][^_]*\)_tp.*/\1 \2/p') + local p1="${parsed%% *}" + local p2="${parsed#* }" + if [[ -z "$prec" && -n "$p1" && "$p1" != "$parsed" ]]; then + prec="$p1" + fi + if [[ -z "$fw" && -n "$p2" && "$p2" != "$parsed" ]]; then + fw="$p2" + fi + fi + fi cat > "${meta_json}" < "$summary_md" || true - - # If running inside a GitHub Actions step on this same machine, append there too - if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then - local GH_SUM_DIR - GH_SUM_DIR="$(dirname "$GITHUB_STEP_SUMMARY")" - if [ -d "$GH_SUM_DIR" ] && [ -w "$GH_SUM_DIR" ]; then - cat "$summary_md" >> "$GITHUB_STEP_SUMMARY" || true - fi - fi - # Move eval artifacts into PWD (no new directories in workspace) - if [ -f "${summary_md}" ]; then - mv -f "${summary_md}" ./ || true - fi if [ -f "${meta_json}" ]; then mv -f "${meta_json}" ./ || true fi if [ -d "${out_dir}" ]; then while IFS= read -r -d '' jf; do base=$(basename "$jf") - if [ "$base" != "meta_env.json" ] && [ "$base" != "SUMMARY.md" ]; then + if [ "$base" != "meta_env.json" ]; then mv -f "$jf" ./ || true fi done < <(find "${out_dir}" -type f -name "*.json" -print0 2>/dev/null) diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh index 44e9dbf4c..56a64bb3a 100644 --- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh @@ -94,3 +94,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 )) + append_lm_eval_summary +fi +set +x diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py index 51b2a71b6..0254d640c 100644 --- a/utils/collect_eval_results.py +++ b/utils/collect_eval_results.py @@ -12,7 +12,6 @@ def find_eval_sets(root: Path) -> List[Path]: New structure: each downloaded artifact is placed under eval_results/<...>/ with flat files inside, e.g.: - meta_env.json - - SUMMARY.md - results_*.json We first check immediate child directories for
meta_env.json to avoid @@ -51,6 +50,23 @@ def detect_eval_jsons(d: Path) -> Tuple[Optional[Path], Optional[Path]]: recursive search for backward compatibility. """ def scan_jsons(paths: List[Path]) -> Tuple[List[Tuple[float, Path]], List[Tuple[float, Path]]]: + """Classify JSON files into lm-eval vs lighteval buckets. + + Returns two lists of (mtime, path) where: + - The first list contains candidates that look like lm-eval outputs. + - The second list contains candidates that look like lighteval outputs. + + Heuristics used (order matters): + - If a JSON has keys like 'lm_eval_version' or 'pretty_env_info', + we treat it as an lm-eval result file. + - If it has both 'config_general' and 'results', we treat it as + a lighteval result file. + - If it only has a top-level 'results' but none of the stronger + signals above, we fall back to classifying it as lm-eval. + + We keep the file modification time to later choose the most recent + candidate; if obtaining mtime fails, we fall back to 0. + """ lm: List[Tuple[float, Path]] = [] le: List[Tuple[float, Path]] = [] for p in paths: @@ -59,13 +75,14 @@ def scan_jsons(paths: List[Path]) -> Tuple[List[Tuple[float, Path]], List[Tuple[ data = load_json(p) if not isinstance(data, dict): continue - # Heuristics similar to utils/lm_eval_to_md.py if 'lm_eval_version' in data or 'pretty_env_info' in data: + # lm-eval harness output try: lm.append((p.stat().st_mtime, p)) except Exception: lm.append((0, p)) elif 'config_general' in data and 'results' in data: + # lighteval output structure try: le.append((p.stat().st_mtime, p)) except Exception: @@ -109,19 +126,92 @@ def parse_pretty_env(pretty_env: str) -> str: def extract_lm_metrics(json_path: Path, task: Optional[str] = None) -> Dict[str, Any]: data = load_json(json_path) or {} results = data.get('results') or {} - # Pick task + # Determine task key robustly: + # 1) explicit argument + # 2) only key in `results` + # 3) only key in `configs` + # 4) 'unknown' t = task if not t: - if isinstance(results, dict) and results: + if isinstance(results, dict) and len(results) == 1: t = next(iter(results.keys())) else: - t = 'unknown' + cfgs = data.get('configs') or {} + if isinstance(cfgs, dict) and len(cfgs) == 1: + t = next(iter(cfgs.keys())) + else: + # fallback to arbitrary but stable choice + t = next(iter(results.keys()), 'unknown') if isinstance(results, dict) else 'unknown' res = results.get(t, {}) if isinstance(results, dict) else {} - strict = res.get('exact_match,strict-match') - flex = res.get('exact_match,flexible-extract') - strict_se = res.get('exact_match_stderr,strict-match') - flex_se = res.get('exact_match_stderr,flexible-extract') + + # Determine base metric name (e.g., 'exact_match') + base_metric: Optional[str] = None + hib = (data.get('higher_is_better') or {}).get(t) if isinstance(data.get('higher_is_better'), dict) else None + if isinstance(hib, dict) and hib: + base_metric = next(iter(hib.keys())) + if not base_metric: + cfg = (data.get('configs') or {}).get(t, {}) if isinstance(data.get('configs'), dict) else {} + ml = cfg.get('metric_list') if isinstance(cfg, dict) else None + if isinstance(ml, list) and ml: + m0 = ml[0] or {} + if isinstance(m0, dict): + base_metric = m0.get('metric') + if not base_metric: + # Fallback: infer from result keys + if isinstance(res, dict): + for k in res.keys(): + if isinstance(k, str) and ',' in k: + base_metric = k.split(',', 1)[0] + break + if not base_metric and 'exact_match' in res: + base_metric = 'exact_match' + if not base_metric: + base_metric 
= 'exact_match' + + # Determine filter names and map to strict/flexible logically without guessing + strict_name: Optional[str] = None + flex_name: Optional[str] = None + cfg = (data.get('configs') or {}).get(t, {}) if isinstance(data.get('configs'), dict) else {} + fl = cfg.get('filter_list') if isinstance(cfg, dict) else None + filter_names: List[str] = [] + if isinstance(fl, list): + for it in fl: + if isinstance(it, dict): + nm = it.get('name') + if isinstance(nm, str): + filter_names.append(nm) + # Prefer semantic names when present; otherwise preserve file order + for nm in filter_names: + if strict_name is None and 'strict' in nm.lower(): + strict_name = nm + if flex_name is None and ('flex' in nm.lower() or 'extract' in nm.lower()): + flex_name = nm + # Fallback to first/second if semantic match not found + if not strict_name and filter_names: + strict_name = filter_names[0] + if not flex_name and len(filter_names) >= 2: + flex_name = filter_names[1] + + # Extract metrics present in results using derived keys + def get_pair(fname: Optional[str]) -> Tuple[Optional[float], Optional[float]]: + if not fname: + # try unfiltered key + v = res.get(base_metric) + se = res.get(f"{base_metric}_stderr") + try: + return float(v) if v is not None else None, float(se) if se is not None else None + except Exception: + return v, se + v = res.get(f"{base_metric},{fname}") + se = res.get(f"{base_metric}_stderr,{fname}") + try: + return float(v) if v is not None else None, float(se) if se is not None else None + except Exception: + return v, se + + strict, strict_se = get_pair(strict_name) + flex, flex_se = get_pair(flex_name) n_eff = None ns = data.get('n-samples') or data.get('n_samples') or {} diff --git a/utils/evals/custom_gsm8k.py b/utils/evals/custom_gsm8k.py index 4449188fa..5445f5732 100644 --- a/utils/evals/custom_gsm8k.py +++ b/utils/evals/custom_gsm8k.py @@ -1,3 +1,5 @@ +# Copied from https://github.com/huggingface/lighteval/blob/99ef5b98d422cf3620eebec9db13285493d35542/src/lighteval/tasks/tasks/gsm8k.py +# Increases generation size to 768 from 256 to better accommodate longer solutions by dsr1. from lighteval.metrics.metrics import Metrics from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.tasks.gsm8k import gsm8k_prompt @@ -11,7 +13,7 @@ evaluation_splits=["test"], few_shots_split=None, few_shots_select="random_sampling_from_train", - generation_size=768, # raise this as needed + generation_size=768, # raised this from 256 metrics=[Metrics.expr_gold_metric], stop_sequence=None, # avoid early stop on "Question:" version=0, diff --git a/utils/evals/gsm8k.yaml b/utils/evals/gsm8k.yaml index ab3113dc2..73a1f7c1e 100644 --- a/utils/evals/gsm8k.yaml +++ b/utils/evals/gsm8k.yaml @@ -1,3 +1,5 @@ +# YAML from https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/gsm8k/gsm8k.yaml +# Changed doc_to_text so model answers properly. Also see lm-evaluation-harness#3411. tag: - math_word_problems task: gsm8k diff --git a/utils/lm_eval_to_md.py b/utils/lm_eval_to_md.py deleted file mode 100644 index 0c59bc494..000000000 --- a/utils/lm_eval_to_md.py +++ /dev/null @@ -1,304 +0,0 @@ -#!/usr/bin/env python3 -""" -Convert latest lm-evaluation-harness and/or lighteval JSONs in a results dir -into Markdown tables for GitHub Actions job summary. Prints to stdout. 
- -Usage (same as before, works even if FRAMEWORK/PRECISION env vars are empty): - python3 utils/lm_eval_to_md.py \ - --results-dir /workspace/eval_out \ - --task gsm8k \ - --framework vLLM \ - --precision fp16 \ - --tp 4 \ - --ep 1 \ - --dp-attention false -""" -import argparse -import json -import os -import re -import sys -from collections import Counter -from glob import glob -from typing import Optional, Tuple, Dict, Any, List - - -# ----------------------- -# Helpers -# ----------------------- - -def pct(x): - return f"{x*100:.2f}%" if isinstance(x, (int, float)) else "N/A" - - -def se(x): - return f" \u00B1{(x*100):.2f}%" if isinstance(x, (int, float)) else "" - - -def gpu_cpu_from_pretty_env(pe: str): - if not isinstance(pe, str) or not pe: - return "Unknown GPU" - gpu_lines = [l for l in pe.splitlines() if l.startswith("GPU ")] - names = [re.sub(r"GPU \d+:\s*", "", l).strip() for l in gpu_lines] - c = Counter(names) - gpu_summary = " + ".join([f"{n}\u00D7 {name}" for name, n in c.items()]) if c else "Unknown GPU" - cpu_line = next((l.split(":", 1)[1].strip() for l in pe.splitlines() if l.startswith("Model name:")), None) - return gpu_summary + (f" ({cpu_line})" if cpu_line else "") - - -def detect_framework_kind(data: Dict[str, Any]) -> str: - """ - Classify JSON as: - - 'lm-eval' : lm-evaluation-harness style JSON - - 'lighteval' : lighteval JSON - - 'unknown' : anything else - """ - # lm-eval has lm_eval_version + results structure like results["gsm8k"]... [oai_citation:0‡results_2025-11-25T08-30-41.513104.json](sediment://file_000000001658720790705168e4c51783) - if "lm_eval_version" in data or "pretty_env_info" in data: - return "lm-eval" - # lighteval has config_general + config_tasks/results keyed by "|" [oai_citation:1‡results_2025-11-25T08-40-05.199875.json](sediment://file_000000006f3872078dd9c458c614c1f7) - if "config_general" in data and "results" in data: - return "lighteval" - return "unknown" - - -def find_all_jsons(results_dir: str) -> List[str]: - paths = [] - for root, _, _ in os.walk(results_dir): - for name in glob(os.path.join(root, "*.json")): - paths.append(name) - return paths - - -def find_latest_by_kind(results_dir: str) -> Tuple[Optional[str], Optional[str]]: - """ - Scan all JSONs under results_dir and return: - (latest_lm_eval_json_path, latest_lighteval_json_path) - """ - lm_eval_candidates = [] - lighteval_candidates = [] - - for path in find_all_jsons(results_dir): - try: - with open(path, "r") as f: - data = json.load(f) - except Exception: - continue - - kind = detect_framework_kind(data) - mtime = os.path.getmtime(path) - if kind == "lm-eval": - lm_eval_candidates.append((mtime, path)) - elif kind == "lighteval": - lighteval_candidates.append((mtime, path)) - - lm_path = max(lm_eval_candidates, default=(None, None))[1] - le_path = max(lighteval_candidates, default=(None, None))[1] - return lm_path, le_path - - -# ----------------------- -# lm-eval parsing -# ----------------------- - -def extract_lm_eval_metrics(data: Dict[str, Any], task: str) -> Dict[str, Any]: - res_all = data.get("results", {}) or {} - res = res_all.get(task) if isinstance(res_all, dict) else {} - if not res and isinstance(res_all, dict) and res_all: - any_key = next(iter(res_all.keys())) - res = res_all.get(any_key, {}) - task = any_key - - strict = res.get("exact_match,strict-match") - flex = res.get("exact_match,flexible-extract") - strict_se = res.get("exact_match_stderr,strict-match") - flex_se = res.get("exact_match_stderr,flexible-extract") - - n_eff = None - ns = 
data.get("n-samples") or data.get("n_samples") or {} - if isinstance(ns, dict): - tdict = ns.get(task) or ns.get("gsm8k") or {} - if isinstance(tdict, dict): - n_eff = tdict.get("effective") or tdict.get("n_eff") - - model = data.get("model_name") \ - or data.get("configs", {}).get(task, {}).get("metadata", {}).get("model") \ - or data.get("config", {}).get("model") \ - or "" - - fewshot = None - nshot = data.get("n-shot") or data.get("n_shot") or {} - if isinstance(nshot, dict): - fewshot = nshot.get(task) or nshot.get("gsm8k") - - limit = None - cfg = data.get("config") or {} - if isinstance(cfg, dict): - limit = cfg.get("limit") - - return { - "task": task, - "strict": strict, - "flex": flex, - "strict_se": strict_se, - "flex_se": flex_se, - "n_eff": n_eff, - "model": model, - "fewshot": fewshot, - "limit": limit, - } - - -def render_lm_eval_section(path: str, - args, - framework_label: str, - precision_label: str) -> Tuple[str, Dict[str, Any]]: - with open(path, "r") as f: - data = json.load(f) - - hardware = gpu_cpu_from_pretty_env(data.get("pretty_env_info", "")) - m = extract_lm_eval_metrics(data, args.task) - - print(f"### {args.task} Evaluation (lm-eval-harness)\n") - print("| Hardware | Framework | Precision | TP | EP | DP Attention | EM Strict | EM Flexible | N (eff) |") - print("|---|---|---:|--:|--:|:--:|--:|--:|--:|") - print( - f"| {hardware} | {framework_label} | {precision_label} | {args.tp} | {args.ep} | " - f"{str(args.dp_attention).lower()} | " - f"{pct(m['strict'])}{se(m['strict_se'])} | " - f"{pct(m['flex'])}{se(m['flex_se'])} | {m['n_eff'] or ''} |" - ) - - lim = m["limit"] - lim_str = str(int(lim)) if isinstance(lim, (int, float)) else (str(lim) if lim is not None else "") - fewshot = m["fewshot"] if m["fewshot"] is not None else "" - print( - f"\n_Model_: `{m['model']}`    " - f"_k-shot_: **{fewshot}**    " - f"_limit_: **{lim_str}** \n" - f"_Source_: `{os.path.basename(path)}`" - ) - return hardware, m - - -# ----------------------- -# lighteval parsing -# ----------------------- - -def extract_lighteval_metrics(data: Dict[str, Any], task_base: str) -> Dict[str, Any]: - res_all = data.get("results", {}) or {} - - # Prefer task-specific key like "gsm8k|5" over "all" [oai_citation:2‡results_2025-11-25T08-40-05.199875.json](sediment://file_000000006f3872078dd9c458c614c1f7) - task_key = None - for k in res_all.keys(): - if k.startswith(task_base): - task_key = k - break - if task_key is None and "all" in res_all: - task_key = "all" - - r = res_all.get(task_key, {}) - em = r.get("extractive_match") - em_se = r.get("extractive_match_stderr") - - # Fewshot & other metadata from config_tasks if available - fewshot = None - cfg_tasks = data.get("config_tasks", {}) - if isinstance(cfg_tasks, dict) and task_key in cfg_tasks: - fewshot = cfg_tasks[task_key].get("num_fewshots") - - # Model name from config_general - cg = data.get("config_general", {}) or {} - model = cg.get("model_name") or cg.get("model_config", {}).get("model_name", "") - - return { - "task": task_key or task_base, - "em": em, - "em_se": em_se, - "fewshot": fewshot, - "model": model, - # lighteval JSON you showed doesn’t expose an obvious effective N; leave blank - "n_eff": None, - } - - -def render_lighteval_section(path: str, - args, - framework_label: str, - precision_label: str, - hardware_fallback: Optional[str]) -> None: - with open(path, "r") as f: - data = json.load(f) - - m = extract_lighteval_metrics(data, args.task) - hardware = hardware_fallback or "Unknown GPU" - - print(f"### {args.task} 
Evaluation (lighteval)\n") - print("| Hardware | Framework | Precision | TP | EP | DP Attention | Extractive Match | N (eff) |") - print("|---|---|---:|--:|--:|:--:|--:|--:|") - print( - f"| {hardware} | {framework_label} | {precision_label} | {args.tp} | {args.ep} | " - f"{str(args.dp_attention).lower()} | " - f"{pct(m['em'])}{se(m['em_se'])} | {m['n_eff'] or ''} |" - ) - - fewshot = m["fewshot"] if m["fewshot"] is not None else "" - print( - f"\n_Model_: `{m['model']}`    " - f"_k-shot_: **{fewshot}** \n" - f"_Source_: `{os.path.basename(path)}`" - ) - - -# ----------------------- -# main -# ----------------------- - -def main(): - ap = argparse.ArgumentParser() - ap.add_argument("--results-dir", required=True) - ap.add_argument("--task", default="gsm8k") - ap.add_argument("--framework", default=os.environ.get("FRAMEWORK", "")) - ap.add_argument("--precision", default=os.environ.get("PRECISION", "")) - ap.add_argument("--tp", default=os.environ.get("TP", "1")) - ap.add_argument("--ep", default=os.environ.get("EP_SIZE", "1")) - ap.add_argument("--dp-attention", default=os.environ.get("DP_ATTENTION", "false")) - args = ap.parse_args() - - # Robust defaults if env vars / CLI args are empty - framework_label = args.framework or os.environ.get("FRAMEWORK") or "unknown" - precision_label = args.precision or os.environ.get("PRECISION") or "unknown" - - lm_path, le_path = find_latest_by_kind(args.results_dir) - - if not lm_path and not le_path: - print(f"### {args.task} Evaluation\n") - print(f"> No result JSON found in `{args.results_dir}`.") - return - - hardware_from_lm = None - - # 1) lm-eval section (if present) - if lm_path: - hardware_from_lm, _ = render_lm_eval_section( - lm_path, args, framework_label, precision_label - ) - - # Spacer between sections if both exist - if lm_path and le_path: - print("\n") - - # 2) lighteval section (if present) - if le_path: - render_lighteval_section( - le_path, args, framework_label, precision_label, hardware_from_lm - ) - - -if __name__ == "__main__": - try: - main() - except Exception as e: - # Never blow up the CI summary; emit a helpful line instead. - print(f"> Failed to render evaluation summary: {e}") - sys.exit(0) \ No newline at end of file diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index a9afc2bc9..a039a52f1 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -761,6 +761,11 @@ def main(): required=True, help='One or more configuration files (YAML format)' ) + parent_parser.add_argument( + '--run-evals', + action='store_true', + help='Opt-in flag to mark a subset of generated configs to run evals. When omitted, no evals run.' 
+ ) # Create main parser parser = argparse.ArgumentParser( @@ -1016,8 +1021,9 @@ def main(): else: parser.error(f"Unknown command: {args.command}") - # Choose eval - matrix_values = mark_eval_entries(matrix_values) + # Choose eval (opt-in via --run-evals) + if args.run_evals: + matrix_values = mark_eval_entries(matrix_values) # Validate output before printing validate_matrix_output(matrix_values) From 5ec3378de2a87629a385212f77a1eea8eaa3f799 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 4 Dec 2025 13:14:30 +0800 Subject: [PATCH 170/214] Resolve issues/nits --- benchmarks/benchmark_lib.sh | 7 +----- benchmarks/gptoss_fp4_h100_slurm.sh | 2 +- runners/launch_h100-cr.sh | 1 - runners/launch_mi300x-amd.sh | 1 - runners/launch_mi300x-cr.sh | 1 - runners/launch_mi325x-amd.sh | 1 - runners/launch_mi325x-tw.sh | 1 - runners/launch_mi355x-amd.sh | 1 - utils/collect_eval_results.py | 10 +++++++- utils/evals/READMEevals.md | 28 ++++++++++++++++++++++ utils/evals/math500.yaml | 36 +++++++++++++++++++++++++++++ 11 files changed, 75 insertions(+), 14 deletions(-) create mode 100644 utils/evals/READMEevals.md create mode 100644 utils/evals/math500.yaml diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 141f66c5a..2e28828a0 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -162,7 +162,7 @@ _install_lm_eval_deps() { python3 -m pip install -q --no-cache-dir "lm-eval[api]" || true # Temporary: workaround issue by using main python3 -m pip install -q --no-cache-dir --no-deps \ - "git+https://github.com/EleutherAI/lm-evaluation-harness.git@main" || true + "git+https://github.com/EleutherAI/lm-evaluation-harness.git@b315ef3b05176acc9732bb7fdec116abe1ecc476" || true } # Patch lm-eval filters to be robust to empty strings via sitecustomize @@ -450,11 +450,6 @@ def _patched___call_api(self, prompt, return_logits, max_new_tokens, num_samples if (not content) and reasoning: return response - if not content and LITELLM_CACHE: - logger.info("Empty content with caching on; retrying uncached once") - kwargs["caching"] = False - response = litellm.completion(**kwargs) - return response except litellm.BadRequestError as e: if "message" in e.__dict__ and "policy" in e.__dict__["message"]: diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh index b9f5fb958..a1321934e 100644 --- a/benchmarks/gptoss_fp4_h100_slurm.sh +++ b/benchmarks/gptoss_fp4_h100_slurm.sh @@ -53,7 +53,7 @@ run_benchmark_serving \ --input-len "$ISL" \ --output-len "$OSL" \ --random-range-ratio "$RANDOM_RANGE_RATIO" \ - --num-prompts $(( $CONC * 1 )) \ + --num-prompts $(( $CONC * 10 )) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh index 6b5d3f9d0..1070b6de0 100644 --- a/runners/launch_h100-cr.sh +++ b/runners/launch_h100-cr.sh @@ -12,7 +12,6 @@ docker run --rm --network=host --name=$server_name \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RUN_EVAL -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \ -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ - ${GH_SUM_ENV} ${GH_SUM_MOUNT} \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_h100_docker.sh" diff --git a/runners/launch_mi300x-amd.sh b/runners/launch_mi300x-amd.sh index 
f6fe97881..e5cea1ed6 100644 --- a/runners/launch_mi300x-amd.sh +++ b/runners/launch_mi300x-amd.sh @@ -15,7 +15,6 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ -e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL \ -${GH_SUM_ENV} ${GH_SUM_MOUNT} \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi300x_docker.sh" diff --git a/runners/launch_mi300x-cr.sh b/runners/launch_mi300x-cr.sh index 965ae4222..f864fec25 100644 --- a/runners/launch_mi300x-cr.sh +++ b/runners/launch_mi300x-cr.sh @@ -15,7 +15,6 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ -e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL \ - ${GH_SUM_ENV} ${GH_SUM_MOUNT} \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi300x_docker.sh" diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh index eb5f8e00c..1065167d7 100644 --- a/runners/launch_mi325x-amd.sh +++ b/runners/launch_mi325x-amd.sh @@ -11,7 +11,6 @@ salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=256 --time=180 --no JOB_ID=$(squeue -u $USER -h -o %A | head -n1) srun --jobid=$JOB_ID bash -c "sudo enroot import -o $SQUASH_FILE docker://$IMAGE" - srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ diff --git a/runners/launch_mi325x-tw.sh b/runners/launch_mi325x-tw.sh index ed6ff288e..488ce6ceb 100644 --- a/runners/launch_mi325x-tw.sh +++ b/runners/launch_mi325x-tw.sh @@ -11,7 +11,6 @@ salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=128 --time=180 --no JOB_ID=$(squeue -u $USER -h -o %A | head -n1) srun --jobid=$JOB_ID bash -c "sudo enroot import -o $SQUASH_FILE docker://$IMAGE" - srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ diff --git a/runners/launch_mi355x-amd.sh b/runners/launch_mi355x-amd.sh index a6c64b1ee..f6507388b 100644 --- a/runners/launch_mi355x-amd.sh +++ b/runners/launch_mi355x-amd.sh @@ -37,7 +37,6 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT -e NUM_PROMPTS \ -e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL \ -${GH_SUM_ENV} ${GH_SUM_MOUNT} \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi355x_docker.sh" diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py index 0254d640c..84be661ce 100644 --- a/utils/collect_eval_results.py +++ b/utils/collect_eval_results.py @@ -313,9 +313,17 @@ def main(): continue # Merge with meta + # Prefer explicit hardware identifiers from meta (if present) and fall back to parsed pretty_env_info + hw_meta = ( + meta.get('hw') + or meta.get('runner') + or meta.get('RUNNER_TYPE') + or None + ) + hw_value = hw_meta if hw_meta else m.get('hardware', 'Unknown GPU') row = { 'model': m.get('model') or meta.get('model') or 'unknown', - 'hw': m.get('hardware', 
'Unknown GPU'), + 'hw': hw_value, 'framework': (meta.get('framework') or 'unknown').lower(), 'precision': (meta.get('precision') or 'unknown').lower(), 'tp': int(meta.get('tp') or 1), diff --git a/utils/evals/READMEevals.md b/utils/evals/READMEevals.md new file mode 100644 index 000000000..511c80804 --- /dev/null +++ b/utils/evals/READMEevals.md @@ -0,0 +1,28 @@ +# Evals + +## What? +Quick graded QnA which measures model performance. Examples of test suites: +- **gsm8k**: Grade school math questions +- **gpqa**: Graduate level, Google-Proof multiple choice questions +- **math500**: Math questions spanning topics like probability, algebra, trigonometry, and geometry. + +## When? +At highest concurrency for highest TP and lowest TP, per GPU per model per ISL/OSL. Logic is defined in `mark_eval_entries` of `utils/matrix-logic/generate_sweep_configs.py`. + +## Why? +To verify how model outputs are affected by throughput optimizations. +- TP/Conc might affect model outputs +- Check kernel implementations for correctness + +## How? +- `run_eval`, defined in `benchmarks/benchmark_lib.sh`, is called in `benchmarks/*`. Either EleutherAI/lm-evaluation-harness (lmeval) or lighteval with litellm is run, using the same endpoint as the throughput benchmark. JSON results are processed and converted to a table with `utils/collect_eval_results.py`. + +## Misc +The following files are task definitions from lmeval; more info on the changes is in the files: +- `utils/evals/math500.yaml` +- `utils/evals/gsm8k.yaml` +The following files are task definitions from lighteval; more info on the changes is in the files: +- `utils/evals/custom_gsm8k.py` + + + diff --git a/utils/evals/math500.yaml b/utils/evals/math500.yaml new file mode 100644 index 000000000..09051d118 --- /dev/null +++ b/utils/evals/math500.yaml @@ -0,0 +1,36 @@ +# YAML from https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/hendrycks_math/hendrycks_math_algebra.yaml +# Changed regex and prompt +tag: + - math_word_problems +task: hendrycks_math_algebra +dataset_path: HuggingFaceH4/MATH-500 +process_docs: !function utils.process_docs +dataset_name: algebra +output_type: generate_until +training_split: train +test_split: test +doc_to_text: "You are solving competition math problems.\n\nFormat rules:\n- Answer on a new line. That line must start with `Answer: ` (capital A, colon, one space).\n- After `Answer: `, write ONLY the answer as inline LaTeX.\n- Use ONLY ASCII LaTeX commands (e.g. \\pi, \\frac{1}{2}, -). NO Unicode symbols.\n- Do NOT wrap the answer in $, $$, \\( \\), \\[ \\], or any other delimiters.\n- Do NOT use \\displaystyle or any display-style commands. Answer only this problem, the rest are examples.
Problem: {{problem}}\n" +process_results: !function utils.process_results +doc_to_target: "{{answer}}" +generation_kwargs: + until: + - "Problem:" + do_sample: false + temperature: 0 +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + regexes_to_ignore: + - "\\\\left" + - "\\\\right" + - "\\s+" +filter_list: + - name: "strict-match" + filter: + - function: "regex" + group_select: -1 + regex_pattern: "Answer:\\s*([^\\n]+)" + - function: "take_first" +metadata: + version: 1.0 \ No newline at end of file From ae4e481bc7e4ac6c9a269a690bddd580e017daf8 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 4 Dec 2025 19:48:49 +0800 Subject: [PATCH 171/214] fix summary table hardware --- benchmarks/benchmark_lib.sh | 3 ++- utils/collect_eval_results.py | 31 +++---------------------------- 2 files changed, 5 insertions(+), 29 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 2e28828a0..2e3ad10c4 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -324,7 +324,8 @@ append_lm_eval_summary() { "tp": ${TP:-1}, "ep": ${EP_SIZE:-1}, "dp_attention": ${dp_json}, - "model": "${model_name:-}" + "model": "${model_name:-}", + "hw": "${RUNNER_TYPE:-unknown}" } META diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py index 84be661ce..6bb771b5d 100644 --- a/utils/collect_eval_results.py +++ b/utils/collect_eval_results.py @@ -109,20 +109,6 @@ def scan_jsons(paths: List[Path]) -> Tuple[List[Tuple[float, Path]], List[Tuple[ return lm_path, le_path -def parse_pretty_env(pretty_env: str) -> str: - try: - lines = [l for l in pretty_env.splitlines() if l.startswith('GPU ')] - names = [l.split(':', 1)[1].strip() for l in lines] - if not names: - return 'Unknown GPU' - # Compress identical names (roughly) - from collections import Counter - c = Counter(names) - return ' + '.join([f"{n}× {name}" for name, n in c.items()]) - except Exception: - return 'Unknown GPU' - - def extract_lm_metrics(json_path: Path, task: Optional[str] = None) -> Dict[str, Any]: data = load_json(json_path) or {} results = data.get('results') or {} @@ -220,11 +206,6 @@ def get_pair(fname: Optional[str]) -> Tuple[Optional[float], Optional[float]]: if isinstance(td, dict): n_eff = td.get('effective') or td.get('n_eff') - hardware = 'Unknown GPU' - pe = data.get('pretty_env_info') - if isinstance(pe, str) and pe: - hardware = parse_pretty_env(pe) - model = ( data.get('model_name') or (data.get('configs', {}).get(t, {}) or {}).get('metadata', {}).get('model') @@ -239,7 +220,7 @@ def get_pair(fname: Optional[str]) -> Tuple[Optional[float], Optional[float]]: 'strict_se': strict_se, 'flex_se': flex_se, 'n_eff': n_eff, - 'hardware': hardware, + 'hardware': 'Unknown GPU', 'model': model, 'source': str(json_path) } @@ -313,14 +294,8 @@ def main(): continue # Merge with meta - # Prefer explicit hardware identifiers from meta (if present) and fall back to parsed pretty_env_info - hw_meta = ( - meta.get('hw') - or meta.get('runner') - or meta.get('RUNNER_TYPE') - or None - ) - hw_value = hw_meta if hw_meta else m.get('hardware', 'Unknown GPU') + # Only use explicit hardware label written to meta_env.json ('hw') + hw_value = meta.get('hw', 'Unknown GPU') row = { 'model': m.get('model') or meta.get('model') or 'unknown', 'hw': hw_value, From 48a220d527da89d353c70a2cce0cfffc9161ead2 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 4 Dec 2025 21:05:26 +0800 Subject: [PATCH 172/214] fix summary table hardware --- 
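Note on the diff below: it threads the runner label end to end (workflow input, then container env, then meta_env.json) so the collector no longer guesses hardware from pretty_env_info. A minimal sketch of that flow, not code from this series; the label "h100-cr" and the jq lookup are illustrative assumptions:

    # The workflow exports the runner label for the job (RUNNER_TYPE, below):
    export RUNNER_TYPE="h100-cr"                  # assumed example value of inputs.runner
    # The launch script forwards it into the serving container unchanged:
    docker run --rm -e RUNNER_TYPE "$IMAGE" benchmarks/gptoss_fp4_h100_docker.sh
    # append_lm_eval_summary (benchmark_lib.sh) writes it to meta_env.json as "hw";
    # the collector can then read the explicit label instead of parsing GPU strings:
    jq -r '.hw' meta_env.json                     # prints: h100-cr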
.github/workflows/benchmark-tmpl.yml | 1 + runners/launch_b200-dgxc.sh | 2 +- runners/launch_b200-nvd.sh | 2 +- runners/launch_h100-cr.sh | 2 +- runners/launch_mi300x-amd.sh | 2 +- runners/launch_mi300x-cr.sh | 2 +- runners/launch_mi355x-amd.sh | 2 +- utils/collect_eval_results.py | 5 +---- 8 files changed, 8 insertions(+), 10 deletions(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 8a8943628..d275f656c 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -134,6 +134,7 @@ jobs: - name: Launch job script env: RUNNER_NAME: ${{ runner.name }} + RUNNER_TYPE: ${{ inputs.runner }} RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_${{ env.FRAMEWORK }}_tp${{ env.TP }}_ep${{ env.EP_SIZE }}_dpa_${{ env.DP_ATTENTION }}_conc${{ env.CONC }}_${{ runner.name }} # Suppress per-job eval markdown from being appended to the step summary. # We'll publish a single combined eval table in the collection job instead. diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index 25d09313e..8406d4bd0 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -41,7 +41,7 @@ docker run --rm --init --network host --name $server_name \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE -e DP_ATTENTION \ -e NCCL_GRAPH_REGISTER=0 \ -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e NUM_PROMPTS -e RUN_EVAL \ +-e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e NUM_PROMPTS -e RUN_EVAL -e RUNNER_TYPE \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh" diff --git a/runners/launch_b200-nvd.sh b/runners/launch_b200-nvd.sh index c6ae289bb..fac3063f2 100644 --- a/runners/launch_b200-nvd.sh +++ b/runners/launch_b200-nvd.sh @@ -42,7 +42,7 @@ docker run --rm --init --network host --name $server_name \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE -e DP_ATTENTION \ -e NCCL_GRAPH_REGISTER=0 \ -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e NUM_PROMPTS -e RUN_EVAL \ +-e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e NUM_PROMPTS -e RUN_EVAL -e RUNNER_TYPE \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh" diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh index 1070b6de0..0174087e4 100644 --- a/runners/launch_h100-cr.sh +++ b/runners/launch_h100-cr.sh @@ -10,7 +10,7 @@ docker run --rm --network=host --name=$server_name \ --runtime=nvidia --gpus=all --ipc=host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RUN_EVAL -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \ +-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RUN_EVAL -e RUNNER_TYPE -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \ -e PYTHONPYCACHEPREFIX=/tmp/pycache/ 
-e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --entrypoint=/bin/bash \ $IMAGE \ diff --git a/runners/launch_mi300x-amd.sh b/runners/launch_mi300x-amd.sh index e5cea1ed6..55fffdb7c 100644 --- a/runners/launch_mi300x-amd.sh +++ b/runners/launch_mi300x-amd.sh @@ -14,7 +14,7 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ --e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL \ +-e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL -e RUNNER_TYPE \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi300x_docker.sh" diff --git a/runners/launch_mi300x-cr.sh b/runners/launch_mi300x-cr.sh index f864fec25..5bd6bd0e2 100644 --- a/runners/launch_mi300x-cr.sh +++ b/runners/launch_mi300x-cr.sh @@ -14,7 +14,7 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ --e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL \ +-e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL -e RUNNER_TYPE \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi300x_docker.sh" diff --git a/runners/launch_mi355x-amd.sh b/runners/launch_mi355x-amd.sh index f6507388b..e3f1ef8e9 100644 --- a/runners/launch_mi355x-amd.sh +++ b/runners/launch_mi355x-amd.sh @@ -36,7 +36,7 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT -e NUM_PROMPTS \ --e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL \ +-e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL -e RUNNER_TYPE \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi355x_docker.sh" diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py index 6bb771b5d..1aeb80e30 100644 --- a/utils/collect_eval_results.py +++ b/utils/collect_eval_results.py @@ -293,12 +293,9 @@ def main(): else: continue - # Merge with meta - # Only use explicit hardware label written to meta_env.json ('hw') - hw_value = meta.get('hw', 'Unknown GPU') row = { 'model': m.get('model') or meta.get('model') or 'unknown', - 'hw': hw_value, + 'hw': meta.get('hw'), 'framework': (meta.get('framework') or 'unknown').lower(), 'precision': (meta.get('precision') or 'unknown').lower(), 'tp': int(meta.get('tp') or 1), From 61327ca20f78e9d60f5271ef1f039df65989c9b1 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 4 Dec 2025 21:20:34 +0800 Subject: [PATCH 173/214] fix summary table hardware 2 --- utils/collect_eval_results.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py index 1aeb80e30..0c188473f 100644 --- a/utils/collect_eval_results.py +++ b/utils/collect_eval_results.py @@ -295,7 +295,7 @@ def 
main(): row = { 'model': m.get('model') or meta.get('model') or 'unknown', - 'hw': meta.get('hw'), + 'hw': (meta.get('hw') or 'unknown').upper(), 'framework': (meta.get('framework') or 'unknown').lower(), 'precision': (meta.get('precision') or 'unknown').lower(), 'tp': int(meta.get('tp') or 1), From 1cf2967d55b615bb75073d42fa80eed684230793 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 5 Dec 2025 23:06:47 +0800 Subject: [PATCH 174/214] final touches --- .github/workflows/eval-gms8k.yml | 64 ------------- .github/workflows/eval-tmpl.yml | 152 ------------------------------- benchmarks/benchmark_lib.sh | 1 + utils/collect_eval_results.py | 11 ++- 4 files changed, 9 insertions(+), 219 deletions(-) delete mode 100644 .github/workflows/eval-gms8k.yml delete mode 100644 .github/workflows/eval-tmpl.yml diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml deleted file mode 100644 index 5a7e7e823..000000000 --- a/.github/workflows/eval-gms8k.yml +++ /dev/null @@ -1,64 +0,0 @@ -name: Eval - GSM8K (PoC) - -on: - workflow_dispatch: - inputs: - exp-name: - description: "Experiment name (prefix selects docker script)" - required: false - type: string - default: "gptoss_gsm8k_poc" - image: - description: "Serving image" - required: false - type: string - default: "vllm/vllm-openai:v0.11.0" - model: - description: "Model" - required: false - type: string - default: "openai/gpt-oss-120b" - tp: - description: "Tensor Parallel Size" - required: false - type: string - default: "2" - port: - description: "Server port" - required: false - type: string - default: "8888" - num_fewshot: - description: "Fewshot k for GSM8K" - required: false - type: string - default: "5" - limit: - description: "Sample limit for GSM8K" - required: false - type: string - default: "1300" - push: - paths: - - '.github/workflows/eval-gms8k.yml' - - '.github/workflows/eval-tmpl.yml' - - 'benchmarks/benchmark_lib.sh' - -jobs: - eval: - uses: ./.github/workflows/eval-tmpl.yml - secrets: inherit - with: - runner: h100-cw_0 - image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} - model: ${{ inputs.model || 'openai/gpt-oss-120b' }} - framework: vllm - precision: fp4 - exp-name: ${{ inputs.exp-name || 'gptoss_gsm8k_poc' }} - tp: '4' - ep: '1' - dp-attn: false - port: ${{ inputs.port || '8888' }} - eval-task: gsm8k - num-fewshot: ${{ inputs.num_fewshot || '5' }} - limit: ${{ inputs.limit || '200' }} diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml deleted file mode 100644 index e4e65a581..000000000 --- a/.github/workflows/eval-tmpl.yml +++ /dev/null @@ -1,152 +0,0 @@ -name: Template - Eval - -on: - workflow_call: - inputs: - runner: - required: true - type: string - image: - required: true - type: string - model: - required: true - type: string - framework: - required: true - type: string - precision: - required: true - type: string - exp-name: - required: true - type: string - tp: - required: true - type: string - ep: - required: false - type: string - default: '1' - dp-attn: - required: false - type: boolean - default: false - port: - required: false - type: string - default: '8888' - eval-task: - required: true - type: string - num-fewshot: - required: false - type: string - default: '5' - limit: - required: false - type: string - default: '200' - -env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - HF_HUB_CACHE: '/mnt/hf_hub_cache/' - EXP_NAME: ${{ inputs.exp-name }} - MODEL: ${{ inputs.model }} - IMAGE: ${{ inputs.image }} - FRAMEWORK: ${{ inputs.framework }} - PRECISION: ${{ 
inputs.precision }} - TP: ${{ inputs.tp }} - EP_SIZE: ${{ inputs.ep }} - DP_ATTENTION: ${{ inputs.dp-attn }} - PORT: ${{ inputs.port }} - EVAL_TASK: ${{ inputs['eval-task'] }} - NUM_FEWSHOT: ${{ inputs['num-fewshot'] }} - LIMIT: ${{ inputs.limit }} - # Keep eval outputs only under /tmp - EVAL_RESULT_DIR: /tmp/eval_out - CONC: '32' - MAX_MODEL_LEN: '4096' - ISL: 1024 - OSL: 1024 - RANDOM_RANGE_RATIO: '0.8' - RESULT_FILENAME: results - RUN_EVAL: true - -jobs: - eval: - runs-on: ${{ inputs.runner }} - timeout-minutes: 180 - name: "Eval ${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.precision }} tp=${{ inputs.tp }} task=${{ inputs['eval-task'] }} limit=${{ inputs.limit }}" - steps: - - name: Resource cleanup - run: | - sudo rm -rf /home/nvadmin/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out/ - # Helper to avoid indefinite hangs on flaky tools (Docker/Slurm) - safe_timeout() { - if command -v timeout >/dev/null 2>&1; then - timeout -k 5 30s "$@" - else - "$@" - fi - } - host=$(hostname) - if [[ "$host" == "b200-81" || "$host" == "b200-80" || "$host" == "b200-79" ]]; then - if command -v docker >/dev/null 2>&1; then - echo "[INFO] Running container-by-container cleanup on $host" - cids=$(safe_timeout docker ps -aq || true) - for cid in $cids; do - echo "[INFO] Cleaning container $cid" - safe_timeout docker stop -t 90 "$cid" || true - safe_timeout docker wait "$cid" >/dev/null 2>&1 || true - safe_timeout docker rm -f "$cid" >/dev/null 2>&1 || true - done - sleep 2 - if nvidia-smi --query-compute-apps=pid --format=csv,noheader | grep -q '[0-9]'; then - echo "[WARN] After stop, GPU still busy:" - nvidia-smi || true - fi - else - echo "[Docker] docker client not found; skipping cleanup" - fi - else - echo "[Docker] skipping docker cleanup on host $host" - fi - # Best-effort cleanup of prior eval outputs; do not block - - if command -v squeue >/dev/null 2>&1; then - echo "[Slurm] Cleaning up resources ..." - safe_timeout scancel -u "$USER" || true - # Wait up to 5 minutes for jobs to clear to avoid indefinite hang - end=$((SECONDS + 300)) - while [ $SECONDS -lt $end ]; do - queued=$(safe_timeout squeue -u "$USER" --noheader --format='%i' 2>/dev/null || true) - if [ -z "$queued" ]; then - break - fi - echo "$queued" | sed 's/^/[Slurm] pending job: /' || true - sleep 5 - done - # Final status; do not block - safe_timeout squeue -u "$USER" || true - if [ -n "$(safe_timeout squeue -u "$USER" --noheader --format='%i' 2>/dev/null || true)" ]; then - echo "[Slurm] Jobs still present after timeout; proceeding" - fi - fi - - - uses: actions/checkout@v5 - with: - fetch-depth: 0 - # Avoid aggressive workspace deletion if stale, rely on git reset/clean later - clean: true - - - name: Launch eval via runner script - env: - RUNNER_NAME: ${{ runner.name }} - RUN_MODE: eval - # Optional: structured filename if runner chooses to use it later - EVAL_RESULT_BASENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_${{ env.FRAMEWORK }}_${{ runner.name }} - run: | - bash ./runners/launch_${RUNNER_NAME%%_*}.sh - - # Intentionally no eval artifact uploads: eval outputs remain in /tmp only. 
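With the standalone eval workflow and its template deleted above, evals now run inline after the throughput pass, inside the same job that already has the server up (see the RUN_EVAL guard added to the benchmark scripts earlier in this series). A minimal sketch of that pattern, assuming run_eval and append_lm_eval_summary are sourced from benchmarks/benchmark_lib.sh and the server is already listening on $PORT:

    #!/bin/bash
    source benchmarks/benchmark_lib.sh
    # ... start the server and run run_benchmark_serving against $PORT ...
    if [ "${RUN_EVAL}" = "true" ]; then
      # Evals hit the same endpoint as the throughput benchmark, at doubled concurrency
      run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
      append_lm_eval_summary   # writes meta_env.json plus result JSONs for the collector
    fi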
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 2e3ad10c4..99b56e20e 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -322,6 +322,7 @@ append_lm_eval_summary() { "framework": "${fw:-unknown}", "precision": "${prec:-unknown}", "tp": ${TP:-1}, + "conc": ${CONC:-1}, "ep": ${EP_SIZE:-1}, "dp_attention": ${dp_json}, "model": "${model_name:-}", diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py index 0c188473f..4f6f0dd30 100644 --- a/utils/collect_eval_results.py +++ b/utils/collect_eval_results.py @@ -300,6 +300,7 @@ def main(): 'precision': (meta.get('precision') or 'unknown').lower(), 'tp': int(meta.get('tp') or 1), 'ep': int(meta.get('ep') or 1), + 'conc': int(meta.get('conc') or 0), 'dp_attention': str(meta.get('dp_attention') or 'false'), 'task': m.get('task') or 'unknown', 'em_strict': m.get('strict'), @@ -312,14 +313,17 @@ def main(): rows.append(row) # Sort for stable output - rows.sort(key=lambda r: (r.get('model',''), r.get('hw',''), r.get('framework',''), r.get('precision',''), r.get('tp',0), r.get('ep',0))) + rows.sort(key=lambda r: ( + r.get('hw',''), r.get('framework',''), + r.get('precision',''), r.get('tp',0), r.get('conc',0) + )) if not rows: print('> No eval results found to summarize.') else: # Print Markdown summary table - print('| Model | Hardware | Framework | Precision | TP | EP | DPA | Task | EM Strict | EM Flexible | N (eff) |') - print('| :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |') + print('| Model | Hardware | Framework | Precision | TP | EP | Conc | DPA | Task | EM Strict | EM Flexible | N (eff) |') + print('| :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |') for r in rows: print( f"| {r['model']} " @@ -328,6 +332,7 @@ def main(): f"| {r['precision'].upper()} " f"| {r['tp']} " f"| {r['ep']} " + f"| {r['conc']} " f"| {r['dp_attention']} " f"| {r['task']} " f"| {pct(r['em_strict'])}{se(r['em_strict_se'])} " From 1d889b8d75d352b723b38049a3a095dca4385bf7 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 6 Dec 2025 11:25:50 +0800 Subject: [PATCH 175/214] Cleanup comments, amend lighteval --- benchmarks/benchmark_lib.sh | 5 ++--- benchmarks/gptoss_fp4_h200_slurm.sh | 2 -- utils/evals/custom_gsm8k.py | 2 +- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 807d6b8eb..18438e8b9 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -160,7 +160,6 @@ run_benchmark_serving() { _install_lm_eval_deps() { python3 -m pip install -q --no-cache-dir "lm-eval[api]" || true - # Temporary: workaround issue by using main python3 -m pip install -q --no-cache-dir --no-deps \ "git+https://github.com/EleutherAI/lm-evaluation-harness.git@b315ef3b05176acc9732bb7fdec116abe1ecc476" || true } @@ -356,7 +355,7 @@ META # ------------------------------ _install_lighteval_deps() { - python3 -m pip install -q --no-cache-dir "lighteval[api]" "litellm" || true + python3 -m pip install -q --no-cache-dir "lighteval==0.13.0" "litellm==1.80.7" || true } # Patch lighteval's LiteLLMClient to handle reasoning content and Python name mangling @@ -615,7 +614,7 @@ run_lighteval_eval() { local base_url="http://0.0.0.0:${port}/v1" export OPENAI_API_KEY="${OPENAI_API_KEY:-EMPTY}" -
local MODEL_ARGS="model_name=${lite_model},base_url=${base_url},api_key=${OPENAI_API_KEY},generation_parameters={temperature:0.0,top_p:1,max_new_tokens:2048},concurrent_requests=${concurrent_requests}" local TASK_SPEC="${task}|${num_fewshot}" # Respect absolute paths (e.g., /tmp/eval_out); otherwise write under /workspace diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh index ac19e6da8..b379e91e5 100644 --- a/benchmarks/gptoss_fp4_h200_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_slurm.sh @@ -39,8 +39,6 @@ export TORCH_CUDA_ARCH_LIST="9.0" PORT=$(( 8888 + $PORT_OFFSET )) MODEL_NAME=${MODEL##*/} -export TORCH_CUDA_ARCH_LIST="9.0" - PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT \ --config config.yaml \ --gpu-memory-utilization 0.9 \ diff --git a/utils/evals/custom_gsm8k.py b/utils/evals/custom_gsm8k.py index 5445f5732..ac6c0b9be 100644 --- a/utils/evals/custom_gsm8k.py +++ b/utils/evals/custom_gsm8k.py @@ -13,7 +13,7 @@ evaluation_splits=["test"], few_shots_split=None, few_shots_select="random_sampling_from_train", - generation_size=768, # raised this from 256 + generation_size=1024, # raised this from 256 metrics=[Metrics.expr_gold_metric], stop_sequence=None, # avoid early stop on "Question:" version=0, From 779a25793ca900e98dab3480be00874fb7b5ac8e Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 15 Dec 2025 11:48:18 -0600 Subject: [PATCH 176/214] pt 1 manual merge conflict fixes --- utils/evals/{READMEevals.md => EVALS.md} | 0 utils/matrix-logic/generate_sweep_configs.py | 1036 ----------- utils/matrix-logic/get_test_sweep_configs.py | 151 -- .../test_generate_sweep_configs.py | 1656 ----------------- utils/matrix_logic/generate_sweep_configs.py | 748 ++++++++ .../{matrix-logic => matrix_logic}/pytest.ini | 0 .../test_generate_sweep_configs.py | 948 ++++++++++ utils/matrix_logic/test_validation.py | 869 +++++++++ utils/matrix_logic/validation.py | 438 +++++ 9 files changed, 3003 insertions(+), 2843 deletions(-) rename utils/evals/{READMEevals.md => EVALS.md} (100%) delete mode 100644 utils/matrix-logic/generate_sweep_configs.py delete mode 100644 utils/matrix-logic/get_test_sweep_configs.py delete mode 100644 utils/matrix-logic/test_generate_sweep_configs.py create mode 100644 utils/matrix_logic/generate_sweep_configs.py rename utils/{matrix-logic => matrix_logic}/pytest.ini (100%) create mode 100644 utils/matrix_logic/test_generate_sweep_configs.py create mode 100644 utils/matrix_logic/test_validation.py create mode 100644 utils/matrix_logic/validation.py diff --git a/utils/evals/READMEevals.md b/utils/evals/EVALS.md similarity index 100% rename from utils/evals/READMEevals.md rename to utils/evals/EVALS.md diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py deleted file mode 100644 index a039a52f1..000000000 --- a/utils/matrix-logic/generate_sweep_configs.py +++ /dev/null @@ -1,1036 +0,0 @@ -import json -import yaml -import argparse -from pydantic import BaseModel, Field, ValidationError, ConfigDict -from typing import List - -# Field name constants -# Top-level config fields -FIELD_IMAGE = 'image' -FIELD_MODEL = 'model' -FIELD_MODEL_PREFIX = 'model-prefix' -FIELD_PRECISION = 'precision' -FIELD_FRAMEWORK = 'framework' -FIELD_RUNNER = 'runner' -FIELD_SEQ_LEN_CONFIGS = 'seq-len-configs' - -# Seq-len-config fields -FIELD_ISL = 'isl' -FIELD_OSL = 'osl' -FIELD_SEARCH_SPACE = 'search-space' - -# Search-space/benchmark fields -FIELD_TP = 'tp' -FIELD_CONC_START = 'conc-start' 
-FIELD_CONC_END = 'conc-end' -FIELD_EP = 'ep' -FIELD_DP_ATTN = 'dp-attn' - -# Matrix entry fields -FIELD_CONC = 'conc' -FIELD_MAX_MODEL_LEN = 'max-model-len' -FIELD_EXP_NAME = 'exp-name' - -# Eval -FIELD_RUN_EVAL = 'run-eval' - -seq_len_stoi = { - "1k1k": (1024, 1024), - "1k8k": (1024, 8192), - "8k1k": (8192, 1024) -} - -# Reverse mapping for exp-name generation -seq_len_itos = {v: k for k, v in seq_len_stoi.items()} - - -def seq_len_to_str(isl: int, osl: int) -> str: - """Convert sequence lengths to short string representation. - - Returns the short name (e.g., '1k1k') if it exists in the mapping, - otherwise returns 'isl_osl' format. - """ - return seq_len_itos.get((isl, osl), f"{isl}_{osl}") - - -class MatrixEntry(BaseModel): - """Pydantic model for validating matrix entry structure.""" - model_config = ConfigDict(extra='forbid', populate_by_name=True) - - image: str - model: str - precision: str - framework: str - runner: str - isl: int - osl: int - tp: int - ep: int - dp_attn: bool = Field(alias='dp-attn') - conc: int - max_model_len: int = Field(alias='max-model-len') - exp_name: str = Field(alias='exp-name') - run_eval: bool = Field(alias='run-eval', default=False) - - -def validate_matrix_output(matrix_values: List[dict]) -> List[dict]: - """Validate that matrix_values entries match the expected structure. - - Raises ValueError if any entry fails validation. - Returns the original list if all entries are valid. - """ - for i, entry in enumerate(matrix_values): - try: - MatrixEntry(**entry) - except ValidationError as e: - raise ValueError(f"Matrix entry at index {i} failed validation:\n{e}") - return matrix_values - -def mark_eval_entries(matrix_values: List[dict]) -> List[dict]: - """Mark entries that should run evaluation. - - For each unique (model, runner, framework, precision, isl, osl) combination: - - Mark highest TP with highest conc - - Mark lowest TP with highest conc - """ - from collections import defaultdict - - # Group entries by (model, runner, framework, precision, isl, osl) - # This ensures we compare within the same configuration, not across different frameworks - groups = defaultdict(list) - for i, entry in enumerate(matrix_values): - key = ( - entry[FIELD_MODEL], - entry[FIELD_RUNNER], - entry[FIELD_FRAMEWORK], - entry[FIELD_PRECISION], - entry[FIELD_ISL], - entry[FIELD_OSL] - ) - groups[key].append((i, entry)) - - # For each group, find highest TP/highest conc and lowest TP/highest conc - eval_indices = set() - for key, entries in groups.items(): - if not entries: - continue - - # Find min and max TP values - min_tp = min(e[FIELD_TP] for _, e in entries) - max_tp = max(e[FIELD_TP] for _, e in entries) - - # Find highest conc for highest TP - highest_tp_entries = [(i, e) for i, e in entries if e[FIELD_TP] == max_tp] - if highest_tp_entries: - max_conc_highest_tp = max(e[FIELD_CONC] for _, e in highest_tp_entries) - for i, e in highest_tp_entries: - if e[FIELD_CONC] == max_conc_highest_tp: - eval_indices.add(i) - - # Find highest conc for lowest TP (only if different from max_tp) - if min_tp != max_tp: - lowest_tp_entries = [(i, e) for i, e in entries if e[FIELD_TP] == min_tp] - if lowest_tp_entries: - max_conc_lowest_tp = max(e[FIELD_CONC] for _, e in lowest_tp_entries) - for i, e in lowest_tp_entries: - if e[FIELD_CONC] == max_conc_lowest_tp: - eval_indices.add(i) - - # Mark the selected entries - for i, entry in enumerate(matrix_values): - entry[FIELD_RUN_EVAL] = i in eval_indices - - return matrix_values - -def 
validate_master_configs_structure(all_config_data): - """Validate the structure of all master config entries. - - This validates that all required fields are present, have correct types, - and no extra fields exist. Should be called once after loading config files. - """ - for key, val in all_config_data.items(): - # Check for required top-level fields and their types - required_fields = { - FIELD_IMAGE: str, - FIELD_MODEL: str, - FIELD_MODEL_PREFIX: str, - FIELD_PRECISION: str, - FIELD_FRAMEWORK: str, - FIELD_RUNNER: str, - FIELD_SEQ_LEN_CONFIGS: list - } - - for field, expected_type in required_fields.items(): - if field not in val or val[field] is None: - raise ValueError( - f"Missing required field '{field}' for key '{key}'") - if not isinstance(val[field], expected_type): - raise ValueError( - f"Field '{field}' must be {expected_type.__name__} for key '{key}', got {type(val[field]).__name__}") - - seq_len_configs = val[FIELD_SEQ_LEN_CONFIGS] - if len(seq_len_configs) == 0: - raise ValueError( - f"'{FIELD_SEQ_LEN_CONFIGS}' must be a non-empty list for key '{key}'") - - # Validate each seq-len-config - for i, seq_config in enumerate(seq_len_configs): - # Check isl - if FIELD_ISL not in seq_config or seq_config[FIELD_ISL] is None: - raise ValueError( - f"Missing '{FIELD_ISL}' in seq-len-config[{i}] for key '{key}'") - if not isinstance(seq_config[FIELD_ISL], int): - raise ValueError( - f"'{FIELD_ISL}' must be int in seq-len-config[{i}] for key '{key}'") - - # Check osl - if FIELD_OSL not in seq_config or seq_config[FIELD_OSL] is None: - raise ValueError( - f"Missing '{FIELD_OSL}' in seq-len-config[{i}] for key '{key}'") - if not isinstance(seq_config[FIELD_OSL], int): - raise ValueError( - f"'{FIELD_OSL}' must be int in seq-len-config[{i}] for key '{key}'") - - bmk_space = seq_config.get(FIELD_SEARCH_SPACE) - if not bmk_space or not isinstance(bmk_space, list) or len(bmk_space) == 0: - raise ValueError( - f"Missing or invalid '{FIELD_SEARCH_SPACE}' in seq-len-config[{i}] for key '{key}'") - - # Validate each benchmark in search-space - for j, bmk in enumerate(bmk_space): - # Define allowed fields - allowed_fields = {FIELD_TP, FIELD_CONC_START, - FIELD_CONC_END, FIELD_EP, FIELD_DP_ATTN} - required_bmk_fields = {FIELD_TP: int, - FIELD_CONC_START: int, FIELD_CONC_END: int} - optional_bmk_fields = {FIELD_EP: int, FIELD_DP_ATTN: bool} - - # Check for extra fields - extra_fields = set(bmk.keys()) - allowed_fields - if extra_fields: - raise ValueError( - f"Extra fields {extra_fields} in search-space[{j}] of seq-len-config[{i}] for key '{key}'") - - # Validate required fields - for field, expected_type in required_bmk_fields.items(): - if field not in bmk or bmk[field] is None: - raise ValueError( - f"Missing '{field}' in search-space[{j}] of seq-len-config[{i}] for key '{key}'") - if not isinstance(bmk[field], expected_type): - raise ValueError( - f"'{field}' must be {expected_type.__name__} in search-space[{j}] of seq-len-config[{i}] for key '{key}'") - - # Validate optional fields if they exist - for field, expected_type in optional_bmk_fields.items(): - if field in bmk and bmk[field] is not None: - if not isinstance(bmk[field], expected_type): - raise ValueError( - f"'{field}' must be {expected_type.__name__} in search-space[{j}] of seq-len-config[{i}] for key '{key}'") - - -def generate_full_sweep(args, all_config_data): - """Generate full sweep configurations with optional filtering. - - Supports filtering by model prefix, precision, framework, runner type, and sequence lengths. 
-    Supports test mode to only run highest TP with lowest concurrency.
-
-    All filters are optional - can generate sweeps for all configs or filter by specific criteria.
-
-    Assumes all_config_data has been validated by validate_master_configs_structure().
-    """
-    # Validate runner types if specified
-    if args.runner_type:
-        if not args.runner_config:
-            raise ValueError(
-                "--runner-config is required when --runner-type is specified")
-
-        try:
-            with open(args.runner_config, 'r') as f:
-                runner_config = yaml.safe_load(f)
-        except FileNotFoundError:
-            raise ValueError(
-                f"Runner config file '{args.runner_config}' does not exist.")
-
-        valid_runner_types = set(runner_config.keys())
-        invalid_runners = set(args.runner_type) - valid_runner_types
-        if invalid_runners:
-            raise ValueError(
-                f"Invalid runner type(s): {invalid_runners}. "
-                f"Valid runner types are: {', '.join(sorted(valid_runner_types))}")
-
-    matrix_values = []
-
-    # Convert seq-lens to set of (isl, osl) tuples for filtering
-    seq_lens_filter = None
-    if args.seq_lens:
-        seq_lens_filter = {seq_len_stoi[sl] for sl in args.seq_lens}
-
-    for key, val in all_config_data.items():
-        # Filter by model prefix if specified
-        if args.model_prefix:
-            if not any(key.startswith(prefix) for prefix in args.model_prefix):
-                continue
-
-        # Filter by precision if specified
-        if args.precision and val[FIELD_PRECISION] not in args.precision:
-            continue
-
-        # Filter by framework if specified
-        if args.framework and val[FIELD_FRAMEWORK] not in args.framework:
-            continue
-
-        # Filter by runner type if specified
-        if args.runner_type and val[FIELD_RUNNER] not in args.runner_type:
-            continue
-
-        seq_len_configs = val[FIELD_SEQ_LEN_CONFIGS]
-        image = val[FIELD_IMAGE]
-        model = val[FIELD_MODEL]
-        precision = val[FIELD_PRECISION]
-        framework = val[FIELD_FRAMEWORK]
-        runner = val[FIELD_RUNNER]
-        model_code = val[FIELD_MODEL_PREFIX]
-
-        for seq_config in seq_len_configs:
-            isl = seq_config[FIELD_ISL]
-            osl = seq_config[FIELD_OSL]
-
-            # Filter by sequence lengths if specified
-            if seq_lens_filter and (isl, osl) not in seq_lens_filter:
-                continue
-
-            bmk_space = seq_config[FIELD_SEARCH_SPACE]
-
-            if args.test_mode:
-                # In test mode, use highest TP with lowest concurrency
-                highest_tp_bmk = max(bmk_space, key=lambda x: x[FIELD_TP])
-                tp = highest_tp_bmk[FIELD_TP]
-                conc = highest_tp_bmk[FIELD_CONC_START]
-                ep = highest_tp_bmk.get(FIELD_EP)
-                dp_attn = highest_tp_bmk.get(FIELD_DP_ATTN)
-
-                seq_len_str = seq_len_to_str(isl, osl)
-                entry = {
-                    FIELD_IMAGE: image,
-                    FIELD_MODEL: model,
-                    FIELD_PRECISION: precision,
-                    FIELD_FRAMEWORK: framework,
-                    FIELD_RUNNER: runner,
-                    FIELD_ISL: isl,
-                    FIELD_OSL: osl,
-                    FIELD_TP: tp,
-                    FIELD_EP: 1, # Default
-                    FIELD_DP_ATTN: False, # Default
-                    FIELD_CONC: conc,
-                    FIELD_MAX_MODEL_LEN: isl + osl + 200,
-                    FIELD_EXP_NAME: f"{model_code}_{seq_len_str}",
-                }
-
-                if ep is not None:
-                    entry[FIELD_EP] = ep
-                if dp_attn is not None:
-                    entry[FIELD_DP_ATTN] = dp_attn
-
-                matrix_values.append(entry)
-            else:
-                # Full sweep mode
-                for bmk in bmk_space:
-                    tp = bmk[FIELD_TP]
-                    conc_start = bmk[FIELD_CONC_START]
-                    conc_end = bmk[FIELD_CONC_END]
-                    ep = bmk.get(FIELD_EP)
-                    dp_attn = bmk.get(FIELD_DP_ATTN)
-
-                    conc = conc_start
-                    while conc <= conc_end:
-                        seq_len_str = seq_len_to_str(isl, osl)
-                        entry = {
-                            FIELD_IMAGE: image,
-                            FIELD_MODEL: model,
-                            FIELD_PRECISION: precision,
-                            FIELD_FRAMEWORK: framework,
-                            FIELD_RUNNER: runner,
-                            FIELD_ISL: isl,
-                            FIELD_OSL: osl,
-                            FIELD_TP: tp,
-                            FIELD_CONC: conc,
-                            FIELD_MAX_MODEL_LEN: isl + osl + 200,
-                            FIELD_EP: 1, # Default
-                            FIELD_DP_ATTN: False, # Default
-                            FIELD_EXP_NAME: f"{model_code}_{seq_len_str}",
-                        }
-
-                        if ep is not None:
-                            entry[FIELD_EP] = ep
-                        if dp_attn is not None:
-                            entry[FIELD_DP_ATTN] = dp_attn
-
-                        matrix_values.append(entry)
-
-                        if conc == conc_end:
-                            break
-                        conc *= args.step_size
-                        if conc > conc_end:
-                            conc = conc_end
-
-    if len(matrix_values) == 0:
-        error_msg = "No configs found matching filters:"
-        if args.model_prefix:
-            error_msg += f" model-prefix={args.model_prefix}"
-        if args.precision:
-            error_msg += f" precision={args.precision}"
-        if args.framework:
-            error_msg += f" framework={args.framework}"
-        if args.runner_type:
-            error_msg += f" runner-type={args.runner_type}"
-        if seq_lens_filter:
-            error_msg += f" seq-lens={args.seq_lens}"
-        raise ValueError(error_msg)
-
-    return matrix_values
-
-
-def generate_test_config(args, all_config_data):
-    """Generate test configurations for a specific key.
-
-    Assumes all_config_data has been validated by validate_master_configs_structure().
-    """
-    try:
-        with open(args.runner_config, 'r') as f:
-            runner_config = yaml.safe_load(f)
-    except FileNotFoundError as e:
-        raise ValueError(
-            f"Runner config file '{args.runner_config}' does not exist.")
-
-    val = all_config_data.get(args.key)
-
-    if not val:
-        raise ValueError(
-            f"Specified key '{args.key}' does not exist in config files.")
-
-    # Extract model code from config
-    model_code = val[FIELD_MODEL_PREFIX]
-
-    runner_nodes = runner_config.get(val[FIELD_RUNNER], [])
-    if args.runner_node and args.runner_node not in runner_nodes:
-        raise ValueError(
-            f"Runner node '{args.runner_node}' is not compatible with config '{args.key}' which runs on runner type '{val[FIELD_RUNNER]}'. Available runner nodes for this config are '{', '.join(runner_nodes)}'.")
-
-    seq_len_configs = val[FIELD_SEQ_LEN_CONFIGS]
-    image = val[FIELD_IMAGE]
-    model = val[FIELD_MODEL]
-    precision = val[FIELD_PRECISION]
-    framework = val[FIELD_FRAMEWORK]
-    # Use default runner or specific runner node if input by user
-    runner = val[FIELD_RUNNER] if not args.runner_node else args.runner_node
-
-    # Convert seq-lens to set of (isl, osl) tuples for filtering
-    seq_lens_filter = None
-    if args.seq_lens:
-        seq_lens_filter = {seq_len_stoi[sl] for sl in args.seq_lens}
-
-    matrix_values = []
-
-    # Process each sequence length configuration
-    for seq_config in seq_len_configs:
-        isl = seq_config[FIELD_ISL]
-        osl = seq_config[FIELD_OSL]
-
-        # Filter by sequence lengths if specified
-        if seq_lens_filter and (isl, osl) not in seq_lens_filter:
-            continue
-
-        bmk_space = seq_config[FIELD_SEARCH_SPACE]
-
-        for bmk in bmk_space:
-            tp = bmk[FIELD_TP]
-            conc_start = bmk[FIELD_CONC_START]
-            conc_end = bmk[FIELD_CONC_END]
-            ep = bmk.get(FIELD_EP)
-            dp_attn = bmk.get(FIELD_DP_ATTN)
-
-            # In test mode, only use the lowest concurrency (conc_start)
-            if args.test_mode:
-                entry = {
-                    FIELD_IMAGE: image,
-                    FIELD_MODEL: model,
-                    FIELD_PRECISION: precision,
-                    FIELD_FRAMEWORK: framework,
-                    FIELD_RUNNER: runner,
-                    FIELD_ISL: isl,
-                    FIELD_OSL: osl,
-                    FIELD_TP: tp,
-                    FIELD_EP: 1, # Default
-                    FIELD_DP_ATTN: False, # Default
-                    FIELD_CONC: conc_start,
-                    FIELD_MAX_MODEL_LEN: isl + osl,
-                    FIELD_EXP_NAME: f"{model_code}_test",
-                }
-
-                # Add optional fields if they exist
-                if ep is not None:
-                    entry[FIELD_EP] = ep
-                if dp_attn is not None:
-                    entry[FIELD_DP_ATTN] = dp_attn
-
-                matrix_values.append(entry)
-            else:
-                # Generate entries for each concurrency value in the range
-                conc = conc_start
-                while conc <= conc_end:
-                    seq_len_str = seq_len_to_str(isl, osl)
-                    entry = {
-                        FIELD_IMAGE: image,
-                        FIELD_MODEL: model,
-                        FIELD_PRECISION: precision,
-                        FIELD_FRAMEWORK: framework,
-                        FIELD_RUNNER: runner,
-                        FIELD_ISL: isl,
-                        FIELD_OSL: osl,
-                        FIELD_TP: tp,
-                        FIELD_EP: 1, # Default
-                        FIELD_DP_ATTN: False, # Default
-                        FIELD_CONC: conc,
-                        FIELD_MAX_MODEL_LEN: isl + osl,
-                        FIELD_EXP_NAME: f"{model_code}_{seq_len_str}",
-                    }
-
-                    # Add optional fields if they exist
-                    if ep is not None:
-                        entry[FIELD_EP] = ep
-                    if dp_attn is not None:
-                        entry[FIELD_DP_ATTN] = dp_attn
-
-                    matrix_values.append(entry)
-
-                    if conc == conc_end:
-                        break
-                    conc *= args.step_size
-                    if conc > conc_end:
-                        conc = conc_end
-
-    return matrix_values
-
-
-def generate_runner_model_sweep_config(args, all_config_data):
-    """Generate runner-model sweep configurations.
-
-    Assumes all_config_data has been validated by validate_master_configs_structure().
-    """
-    try:
-        with open(args.runner_config, 'r') as f:
-            runner_config = yaml.safe_load(f)
-    except FileNotFoundError as e:
-        raise ValueError(
-            f"Runner config file '{args.runner_config}' does not exist.")
-
-    runner_nodes = runner_config.get(args.runner_type)
-
-    if not runner_nodes:
-        raise ValueError(
-            f"Runner '{args.runner_type}' does not exist in runner config '{args.runner_config}'. Must choose from existing runner types: '{', '.join(runner_config.keys())}'.")
-
-    # Filter runner nodes if filter is specified
-    if args.runner_node_filter:
-        runner_nodes = [node for node in runner_nodes if args.runner_node_filter in node]
-        if not runner_nodes:
-            raise ValueError(
-                f"No runner nodes found matching filter '{args.runner_node_filter}' for runner type '{args.runner_type}'.")
-
-    matrix_values = []
-    for key, val in all_config_data.items():
-        # Only consider configs with specified runner
-        if val[FIELD_RUNNER] != args.runner_type:
-            continue
-
-        # Get model code for exp_name
-        model_code = val[FIELD_MODEL_PREFIX]
-
-        # Find 1k1k config
-        target_config = None
-        for config in val[FIELD_SEQ_LEN_CONFIGS]:
-            if config[FIELD_ISL] == 1024 and config[FIELD_OSL] == 1024:
-                target_config = config
-                break
-
-        # Skip configs that have no 1k1k seq-len entry
-        if target_config is None:
-            continue
-
-        highest_tp_bmk = max(target_config[FIELD_SEARCH_SPACE], key=lambda x: x[FIELD_TP])
-        # Since we are just testing, pick the highest TP for this config and just test
-        # on that TP with the lowest concurrency available
-        highest_tp = highest_tp_bmk[FIELD_TP]
-        lowest_conc = highest_tp_bmk[FIELD_CONC_START]
-
-        ep = highest_tp_bmk.get(FIELD_EP)
-        dp_attn = highest_tp_bmk.get(FIELD_DP_ATTN)
-
-        for node in runner_nodes:
-            entry = {
-                FIELD_IMAGE: val[FIELD_IMAGE],
-                FIELD_MODEL: val[FIELD_MODEL],
-                FIELD_PRECISION: val[FIELD_PRECISION],
-                FIELD_FRAMEWORK: val[FIELD_FRAMEWORK],
-                # Add one entry for each node under specified runner type
-                FIELD_RUNNER: node,
-                # Again, just use 1k1k since this is just meant to smoke test all runners
-                FIELD_ISL: 1024,
-                FIELD_OSL: 1024,
-                FIELD_TP: highest_tp,
-                FIELD_EP: 1, # Default
-                FIELD_DP_ATTN: False, # Default
-                FIELD_CONC: lowest_conc,
-                FIELD_MAX_MODEL_LEN: 2048,
-                FIELD_EXP_NAME: f"{model_code}_test",
-            }
-
-            # Add optional fields if they exist
-            if ep is not None:
-                entry[FIELD_EP] = ep
-            if dp_attn is not None:
-                entry[FIELD_DP_ATTN] = dp_attn
-
-            matrix_values.append(entry)
-
-    return matrix_values
-
-
-def generate_custom_test(args):
-    """Generate a single 1k1k job for custom inputs.
-    """
-    try:
-        with open(args.runner_config, 'r') as f:
-            runner_config = yaml.safe_load(f)
-    except FileNotFoundError as e:
-        raise ValueError(
-            f"Runner config file '{args.runner_config}' does not exist.")
-
-    found_runner_label = False
-    for runner_type, runner_nodes in runner_config.items():
-        if args.runner_label == runner_type or args.runner_label in runner_nodes:
-            found_runner_label = True
-
-    if not found_runner_label:
-        raise ValueError(f"Unable to find specified runner label '{args.runner_label}'.")
-
-    return [
-        {
-            FIELD_IMAGE: args.image,
-            FIELD_MODEL: args.model,
-            FIELD_PRECISION: args.precision,
-            FIELD_FRAMEWORK: args.framework,
-            FIELD_RUNNER: args.runner_label,
-            # Again, just use 1k1k since this is just meant to smoke test all runners
-            FIELD_ISL: 1024,
-            FIELD_OSL: 1024,
-            FIELD_TP: 8,
-            FIELD_EP: 1,
-            FIELD_DP_ATTN: False,
-            FIELD_CONC: 4,
-            FIELD_EXP_NAME: args.exp_name,
-            FIELD_MAX_MODEL_LEN: 2048,
-        }
-    ]
-
-
-def generate_runner_sweep_config(args, all_config_data):
-    """Generate runner sweep configurations.
-
-    Assumes all_config_data has been validated by validate_master_configs_structure().
-    """
-    try:
-        with open(args.runner_config, 'r') as f:
-            runner_config = yaml.safe_load(f)
-    except FileNotFoundError as e:
-        raise ValueError(
-            f"Runner config file '{args.runner_config}' does not exist.")
-
-    if not runner_config.get(args.runner_type):
-        raise ValueError(
-            f"Runner '{args.runner_type}' does not exist in runner config '{args.runner_config}'. Must choose from existing runner types: '{', '.join(runner_config.keys())}'.")
-
-    matrix_values = []
-    for key, val in all_config_data.items():
-        # Only consider configs with the specified model prefix and runner type
-        if not key.startswith(args.model_prefix):
-            continue
-
-        if not val[FIELD_RUNNER] == args.runner_type:
-            continue
-
-        # Optionally filter by precision and framework
-        if (args.precision and val[FIELD_PRECISION] != args.precision) or (args.framework and val[FIELD_FRAMEWORK] != args.framework):
-            continue
-
-        # Get model code for exp_name
-        model_code = val[FIELD_MODEL_PREFIX]
-
-        runner_nodes = runner_config.get(val[FIELD_RUNNER])
-        if not runner_nodes:
-            raise ValueError(
-                f"Runner '{val[FIELD_RUNNER]}' does not exist in runner config '{args.runner_config}'. Must choose from existing runner types: '{', '.join(runner_config.keys())}'.")
-
-        # Find 1k1k config
-        target_config = None
-        for config in val[FIELD_SEQ_LEN_CONFIGS]:
-            if config[FIELD_ISL] == 1024 and config[FIELD_OSL] == 1024:
-                target_config = config
-                break
-
-        # Skip configs that have no 1k1k seq-len entry
-        if target_config is None:
-            continue
-
-        highest_tp_bmk = max(target_config[FIELD_SEARCH_SPACE], key=lambda x: x[FIELD_TP])
-        # Since we are just testing, pick the highest TP for this config and just test
-        # on that TP with the lowest concurrency available
-        highest_tp = highest_tp_bmk[FIELD_TP]
-        lowest_conc = highest_tp_bmk[FIELD_CONC_START]
-
-        ep = highest_tp_bmk.get(FIELD_EP)
-        dp_attn = highest_tp_bmk.get(FIELD_DP_ATTN)
-
-        for node in runner_nodes:
-            entry = {
-                FIELD_IMAGE: val[FIELD_IMAGE],
-                FIELD_MODEL: val[FIELD_MODEL],
-                FIELD_PRECISION: val[FIELD_PRECISION],
-                FIELD_FRAMEWORK: val[FIELD_FRAMEWORK],
-                # Add one entry for each node under specified runner type
-                FIELD_RUNNER: node,
-                # Again, just use 1k1k since this is just meant to smoke test all runners
-                FIELD_ISL: 1024,
-                FIELD_OSL: 1024,
-                FIELD_TP: highest_tp,
-                FIELD_EP: 1, # Default
-                FIELD_DP_ATTN: False, # Default
-                FIELD_CONC: lowest_conc,
-                FIELD_EXP_NAME: f"{model_code}_test",
-                FIELD_MAX_MODEL_LEN: 2048,
-            }
-
-            # Add optional fields if they exist
-            if ep is not None:
-                entry[FIELD_EP] = ep
-            if dp_attn is not None:
-                entry[FIELD_DP_ATTN] = dp_attn
-
-            matrix_values.append(entry)
-
-    if len(matrix_values) == 0:
-        error_msg = f"No configs found matching model prefix '{args.model_prefix}'"
-        if args.precision:
-            error_msg += f", precision '{args.precision}'"
-        if args.framework:
-            error_msg += f", framework '{args.framework}'"
-        raise ValueError(error_msg + ".")
-
-    return matrix_values
-
-
-def load_config_files(config_files):
-    """Load and merge configuration files."""
-    all_config_data = {}
-    for config_file in config_files:
-        try:
-            with open(config_file, 'r') as f:
-                config_data = yaml.safe_load(f)
-            assert isinstance(
-                config_data, dict), f"Config file '{config_file}' must contain a dictionary"
-
-            # Check for duplicate keys. This guards against the unlikely case where an
-            # entry in one config accidentally (or intentionally) overrides an entry in another config
-            duplicate_keys = set(all_config_data.keys()) & set(
-                config_data.keys())
-            if duplicate_keys:
-                raise ValueError(
-                    f"Duplicate configuration keys found in '{config_file}': {', '.join(sorted(duplicate_keys))}"
-                )
-
-            all_config_data.update(config_data)
-        except FileNotFoundError:
-            raise ValueError(f"Input file '{config_file}' does not exist.")
-
-    return all_config_data
-
-
-def main():
-    # Create parent parser with common arguments
-    parent_parser = argparse.ArgumentParser(add_help=False)
-    parent_parser.add_argument(
-        '--config-files',
-        nargs='+',
-        required=True,
-        help='One or more configuration files (YAML format)'
-    )
-    parent_parser.add_argument(
-        '--run-evals',
-        action='store_true',
-        help='Opt-in flag to mark a subset of generated configs to run evals. When omitted, no evals run.'
-    )
-
-    # Create main parser
-    parser = argparse.ArgumentParser(
-        description='Generate benchmark configurations from YAML config files'
-    )
-
-    # Create subparsers for subcommands
-    subparsers = parser.add_subparsers(
-        dest='command',
-        required=True,
-        help='Available commands'
-    )
-
-    # Subcommand: full-sweep
-    full_sweep_parser = subparsers.add_parser(
-        'full-sweep',
-        parents=[parent_parser],
-        add_help=False,
-        help='Generate full sweep configurations with optional filtering by model, precision, framework, runner type, and sequence lengths'
-    )
-    full_sweep_parser.add_argument(
-        '--model-prefix',
-        nargs='+',
-        required=False,
-        help='Model prefix(es) to filter configurations (optional, can specify multiple)'
-    )
-    full_sweep_parser.add_argument(
-        '--precision',
-        nargs='+',
-        required=False,
-        help='Precision(s) to filter by (e.g., fp4, fp8) (optional, can specify multiple)'
-    )
-    full_sweep_parser.add_argument(
-        '--framework',
-        nargs='+',
-        required=False,
-        help='Framework(s) to filter by (e.g., vllm, trt, sglang) (optional, can specify multiple)'
-    )
-    full_sweep_parser.add_argument(
-        '--runner-type',
-        nargs='+',
-        required=False,
-        help='Runner type(s) to filter by (e.g., h200, h100) (optional, can specify multiple)'
-    )
-    full_sweep_parser.add_argument(
-        '--runner-config',
-        required=False,
-        help='Configuration file holding runner information (required if --runner-type is specified)'
-    )
-    full_sweep_parser.add_argument(
-        '--seq-lens',
-        nargs='+',
-        choices=list(seq_len_stoi.keys()),
-        required=False,
-        help=f"Sequence length configurations to include: {', '.join(seq_len_stoi.keys())}. If not specified, all sequence lengths are included."
-    )
-    full_sweep_parser.add_argument(
-        '--step-size',
-        type=int,
-        default=2,
-        help='Step size for concurrency values (default: 2)'
-    )
-    full_sweep_parser.add_argument(
-        '--test-mode',
-        action='store_true',
-        help='Test mode: only run highest TP with lowest concurrency for each matching config'
-    )
-    full_sweep_parser.add_argument(
-        '-h', '--help',
-        action='help',
-        help='Show this help message and exit'
-    )
-
-    # Subcommand: test-config
-    test_config_parser = subparsers.add_parser(
-        'test-config',
-        parents=[parent_parser],
-        add_help=False,
-        help='Given a config key, run that configuration as specified. Optionally specify --test-mode to only run one parallelism-concurrency pair for the config.'
-    )
-    test_config_parser.add_argument(
-        '--runner-config',
-        required=True,
-        help='Configuration file holding runner information'
-    )
-    test_config_parser.add_argument(
-        '--key',
-        required=True,
-        help='Configuration key to use'
-    )
-    test_config_parser.add_argument(
-        '--runner-node',
-        required=False,
-        help='Specific runner node to use'
-    )
-    test_config_parser.add_argument(
-        '--seq-lens',
-        nargs='+',
-        choices=list(seq_len_stoi.keys()),
-        required=False,
-        help=f"Sequence length configurations to include: {', '.join(seq_len_stoi.keys())}. If not specified, all sequence lengths are included."
-    )
-    test_config_parser.add_argument(
-        '--step-size',
-        type=int,
-        default=2,
-        help='Step size for concurrency values (default: 2)'
-    )
-    test_config_parser.add_argument(
-        '--test-mode',
-        action='store_true',
-        help='Generate only the lowest concurrency value for each TP level'
-    )
-    test_config_parser.add_argument(
-        '-h', '--help',
-        action='help',
-        help='Show this help message and exit'
-    )
-
-    # Subcommand: runner-model-sweep
-    test_config_parser = subparsers.add_parser(
-        'runner-model-sweep',
-        parents=[parent_parser],
-        add_help=False,
-        help='Given a runner type, find all configurations matching the type, and run that configuration on all individual runner nodes for the specified runner type. This is meant to validate that all runner nodes work on all configurations for a runner type. For instance, use this to validate that all configs that specify an h200 runner run successfully across all h200 runner nodes.'
-    )
-    test_config_parser.add_argument(
-        '--runner-type',
-        required=True,
-        help='Runner type (e.g., b200-trt, h100)'
-    )
-    test_config_parser.add_argument(
-        '--runner-config',
-        required=True,
-        help='Configuration file holding runner information'
-    )
-    test_config_parser.add_argument(
-        '--runner-node-filter',
-        required=False,
-        help='Filter runner nodes by substring match (e.g., "mi300x-amd" to only include nodes containing that string)'
-    )
-    test_config_parser.add_argument(
-        '-h', '--help',
-        action='help',
-        help='Show this help message and exit'
-    )
-
-    # Subcommand: runner-sweep
-    test_config_parser = subparsers.add_parser(
-        'runner-sweep',
-        parents=[parent_parser],
-        add_help=False,
-        help='Given a model (and optionally a precision and framework), find all configurations matching the inputs, and run those configurations across all compatible runner nodes. This is meant to validate that all runner nodes that should run a particular model can do so. For instance, use this to validate that all runner nodes that should run gptoss-120b actually do so successfully.'
-    )
-    test_config_parser.add_argument(
-        '--runner-type',
-        required=True,
-        help='Runner type (e.g., b200-trt, h100)'
-    )
-    test_config_parser.add_argument(
-        '--model-prefix',
-        required=True,
-        help='Model prefix (e.g., 70b)'
-    )
-    test_config_parser.add_argument(
-        '--precision',
-        required=False,
-        help='Precision to filter by (e.g., fp4) (optional)'
-    )
-    test_config_parser.add_argument(
-        '--framework',
-        required=False,
-        help='Framework to filter by (e.g., trt) (optional)'
-    )
-    test_config_parser.add_argument(
-        '--runner-config',
-        required=True,
-        help='Configuration file holding runner information'
-    )
-    test_config_parser.add_argument(
-        '-h', '--help',
-        action='help',
-        help='Show this help message and exit'
-    )
-
-    # Subcommand: custom
-    test_config_parser = subparsers.add_parser(
-        'custom',
-        parents=[parent_parser],
-        add_help=False,
-        help='Enter custom values'
-    )
-    test_config_parser.add_argument(
-        '--runner-label',
-        required=True,
-        help='Label associated with runner on which to launch the corresponding job (e.g., h200, h200-nv_1, etc.)'
-    )
-    test_config_parser.add_argument(
-        '--image',
-        required=True,
-        help='Image to run the benchmark (e.g., vllm/vllm-openai:latest)'
-    )
-    test_config_parser.add_argument(
-        '--model',
-        required=True,
-        help='Model to run (e.g., openai/gpt-oss-120b)'
-    )
-    test_config_parser.add_argument(
-        '--framework',
-        required=True,
-        help='Framework to run on (e.g., vllm, trt, sglang)'
-    )
-    test_config_parser.add_argument(
-        '--precision',
-        required=True,
-        help='Precision to run (e.g., fp4, fp8)'
-    )
-    test_config_parser.add_argument(
-        '--exp-name',
-        required=True,
-        help='Experiment name (e.g., 70b_test)'
-    )
-    test_config_parser.add_argument(
-        '--runner-config',
-        required=True,
-        help='Configuration file holding runner information'
-    )
-    test_config_parser.add_argument(
-        '-h', '--help',
-        action='help',
-        help='Show this help message and exit'
-    )
-
-    args = parser.parse_args()
-
-    # Load and validate configuration files
-    all_config_data = load_config_files(args.config_files)
-    validate_master_configs_structure(all_config_data)
-
-    # Route to appropriate function based on subcommand
-    if args.command == 'full-sweep':
-        matrix_values = generate_full_sweep(args, all_config_data)
-    elif args.command == 'test-config':
-        matrix_values = generate_test_config(args, all_config_data)
-    elif args.command == 'runner-model-sweep':
-        matrix_values = generate_runner_model_sweep_config(
-            args, all_config_data)
-    elif args.command == 'runner-sweep':
-        matrix_values = generate_runner_sweep_config(
-            args, all_config_data)
-    elif args.command == 'custom':
-        matrix_values = generate_custom_test(args)
-    else:
-        parser.error(f"Unknown command: {args.command}")
-
-    # Mark eval entries (opt-in via --run-evals)
-    if args.run_evals:
-        matrix_values = mark_eval_entries(matrix_values)
-
-    # Validate output before printing
-    validate_matrix_output(matrix_values)
-
-    print(json.dumps(matrix_values))
-    return matrix_values
-
-
-if __name__ == "__main__":
-    main()
diff --git a/utils/matrix-logic/get_test_sweep_configs.py b/utils/matrix-logic/get_test_sweep_configs.py
deleted file mode 100644
index 4df4a51eb..000000000
--- a/utils/matrix-logic/get_test_sweep_configs.py
+++ /dev/null
@@ -1,151 +0,0 @@
-import json
-import yaml
-import sys
-import argparse
-
-seq_len_stoi = {
-    "1k1k": (1024, 1024),
-    "1k8k": (1024, 8192),
-    "8k1k": (8192, 1024)
-}
-
-def main():
-    parser = argparse.ArgumentParser(
-        description='Generate benchmark matrix from a specific 
configuration key' - ) - parser.add_argument( - '--config-files', - nargs='+', - required=True, - help='One or more configuration files (YAML format)' - ) - parser.add_argument( - '--key', - required=True, - help='Configuration key to use' - ) - parser.add_argument( - '--seq-lens', - nargs='+', - choices=list(seq_len_stoi.keys()), - required=False, - help=f"Sequence length configurations to include: {', '.join(seq_len_stoi.keys())}. If not specified, all sequence lengths are included." - ) - parser.add_argument( - '--step-size', - type=int, - default=2, - help='Step size for concurrency values (default: 2)' - ) - - args = parser.parse_args() - - # Convert seq-lens to set of (isl, osl) tuples for filtering - seq_lens_filter = None - if args.seq_lens: - seq_lens_filter = {seq_len_stoi[sl] for sl in args.seq_lens} - - # Load and merge all config files - all_config_data = {} - for config_file in args.config_files: - try: - with open(config_file, 'r') as f: - config_data = yaml.safe_load(f) - assert isinstance(config_data, dict), f"Config file '{config_file}' must contain a dictionary" - - # Check for duplicate keys - duplicate_keys = set(all_config_data.keys()) & set(config_data.keys()) - if duplicate_keys: - raise ValueError( - f"Duplicate configuration keys found in '{config_file}': {', '.join(sorted(duplicate_keys))}" - ) - - all_config_data.update(config_data) - except FileNotFoundError: - raise ValueError(f"Input file '{config_file}' does not exist.") - - # Check if the key exists - if args.key not in all_config_data: - available_keys = ', '.join(sorted(all_config_data.keys())) - raise ValueError( - f"Key '{args.key}' not found in configuration files. " - f"Available keys: {available_keys}" - ) - - val = all_config_data[args.key] - - # Validate required fields - seq_len_configs = val.get('seq-len-configs') - assert seq_len_configs, f"Missing 'seq-len-configs' for key '{args.key}'" - - image = val.get('image') - model = val.get('model') - precision = val.get('precision') - framework = val.get('framework') - runner = val.get('runner') - - assert None not in (image, model, precision, framework, runner), \ - f"Missing required fields (image, model, precision, framework, runner) for key '{args.key}'" - - matrix_values = [] - - # Process each sequence length configuration - for seq_config in seq_len_configs: - isl = seq_config.get('isl') - osl = seq_config.get('osl') - - assert None not in (isl, osl), \ - f"Missing 'isl' or 'osl' in seq-len-config for key '{args.key}'" - - # Filter by sequence lengths if specified - if seq_lens_filter and (isl, osl) not in seq_lens_filter: - continue - - bmk_space = seq_config.get('bmk-space') - assert bmk_space, f"Missing 'bmk-space' in seq-len-config for key '{args.key}'" - - for bmk in bmk_space: - tp = bmk.get('tp') - conc_start = bmk.get('conc-start') - conc_end = bmk.get('conc-end') - ep = bmk.get('ep') - dp_attn = bmk.get('dp-attn') - - assert None not in (tp, conc_start, conc_end), \ - f"Missing 'tp', 'conc-start', or 'conc-end' in bmk-space for key '{args.key}'" - - # Generate entries for each concurrency value in the range - conc = conc_start - while conc <= conc_end: - entry = { - 'image': image, - 'model': model, - 'precision': precision, - 'framework': framework, - 'runner': runner, - 'isl': isl, - 'osl': osl, - 'tp': tp, - 'conc': conc, - 'max-model-len': isl + osl, - } - - # Add optional fields if they exist - if ep is not None: - entry['ep'] = ep - if dp_attn is not None: - entry['dp-attn'] = dp_attn - - matrix_values.append(entry) - - if conc 
== conc_end: - break - conc *= args.step_size - if conc > conc_end: - conc = conc_end - - print(json.dumps(matrix_values)) - return matrix_values - -if __name__ == "__main__": - main() diff --git a/utils/matrix-logic/test_generate_sweep_configs.py b/utils/matrix-logic/test_generate_sweep_configs.py deleted file mode 100644 index c184ecbab..000000000 --- a/utils/matrix-logic/test_generate_sweep_configs.py +++ /dev/null @@ -1,1656 +0,0 @@ -import pytest -import yaml -from unittest.mock import patch -from generate_sweep_configs import ( - validate_master_configs_structure, - validate_matrix_output, - seq_len_to_str, - generate_full_sweep, - generate_test_config, - generate_runner_model_sweep_config, - generate_runner_sweep_config, - generate_custom_test, - load_config_files, - main, - MatrixEntry, -) - - -# Fixtures for test config files -@pytest.fixture -def sample_master_config(): - """Sample master config with valid entries.""" - return { - "70b-fp8-vllm": { - "image": "vllm/vllm-openai:v0.10.2", - "model": "meta-llama/Llama-3-70b", - "model-prefix": "70b", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [ - {"tp": 4, "conc-start": 1, "conc-end": 4}, - {"tp": 8, "conc-start": 2, "conc-end": 8, "ep": 2, "dp-attn": True} - ] - }, - { - "isl": 1024, - "osl": 8192, - "search-space": [ - {"tp": 8, "conc-start": 1, "conc-end": 2} - ] - } - ] - }, - "8b-fp4-trt": { - "image": "nvcr.io/nvidia/tritonserver:24.01", - "model": "meta-llama/Llama-3-8b", - "model-prefix": "8b", - "precision": "fp4", - "framework": "trt", - "runner": "h100", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [ - {"tp": 2, "conc-start": 4, "conc-end": 16} - ] - } - ] - }, - "gptoss-120b-fp8-vllm": { - "image": "vllm/vllm-openai:latest", - "model": "openai/gpt-oss-120b", - "model-prefix": "gptoss", - "precision": "fp8", - "framework": "vllm", - "runner": "h200-trt", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [ - {"tp": 8, "conc-start": 1, "conc-end": 4} - ] - } - ] - } - } - - -@pytest.fixture -def sample_runner_config(): - """Sample runner config.""" - return { - "h200": ["h200-nv_1", "h200-nv_2"], - "h100": ["h100-aws_1"], - "h200-trt": ["h200-trt_1", "h200-trt_2", "h200-trt_3"] - } - - -@pytest.fixture -def temp_config_files(tmp_path, sample_master_config, sample_runner_config): - """Create temporary config files.""" - master_file = tmp_path / "master.yaml" - runner_file = tmp_path / "runners.yaml" - - with open(master_file, 'w') as f: - yaml.dump(sample_master_config, f) - - with open(runner_file, 'w') as f: - yaml.dump(sample_runner_config, f) - - return str(master_file), str(runner_file) - - -@pytest.fixture -def invalid_master_config(): - """Master config with validation errors.""" - return { - "missing-field": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - # Missing precision, framework, runner, seq-len-configs - } - } - - -# Tests for seq_len_to_str -def test_seq_len_to_str_with_mapping(): - """Test seq_len_to_str with known mappings.""" - assert seq_len_to_str(1024, 1024) == "1k1k" - assert seq_len_to_str(1024, 8192) == "1k8k" - assert seq_len_to_str(8192, 1024) == "8k1k" - - -def test_seq_len_to_str_without_mapping(): - """Test seq_len_to_str fallback for unknown mappings.""" - assert seq_len_to_str(2048, 4096) == "2048_4096" - assert seq_len_to_str(512, 512) == "512_512" - - -# Tests for MatrixEntry validation -def 
test_matrix_entry_valid(): - """Test valid MatrixEntry.""" - entry = { - "image": "test:latest", - "model": "test/model", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "isl": 1024, - "osl": 1024, - "tp": 8, - "ep": 1, - "dp-attn": False, - "conc": 4, - "max-model-len": 2048, - "exp-name": "test_exp" - } - result = MatrixEntry(**entry) - assert result.image == "test:latest" - assert result.tp == 8 - - -def test_matrix_entry_missing_field(): - """Test MatrixEntry with missing required field.""" - entry = { - "image": "test:latest", - "model": "test/model", - # Missing other required fields - } - with pytest.raises(Exception): # Pydantic ValidationError - MatrixEntry(**entry) - - -def test_matrix_entry_wrong_type(): - """Test MatrixEntry with wrong type.""" - entry = { - "image": "test:latest", - "model": "test/model", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "isl": "not_an_int", # Wrong type - "osl": 1024, - "tp": 8, - "ep": 1, - "dp-attn": False, - "conc": 4, - "max-model-len": 2048, - "exp-name": "test_exp" - } - with pytest.raises(Exception): # Pydantic ValidationError - MatrixEntry(**entry) - - -def test_matrix_entry_extra_field(): - """Test MatrixEntry with extra field (should be forbidden).""" - entry = { - "image": "test:latest", - "model": "test/model", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "isl": 1024, - "osl": 1024, - "tp": 8, - "ep": 1, - "dp-attn": False, - "conc": 4, - "max-model-len": 2048, - "exp-name": "test_exp", - "extra-field": "should_fail" - } - with pytest.raises(Exception): # Pydantic ValidationError - MatrixEntry(**entry) - - -# Tests for validate_matrix_output -def test_validate_matrix_output_valid(): - """Test validate_matrix_output with valid entries.""" - entries = [ - { - "image": "test:latest", - "model": "test/model", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "isl": 1024, - "osl": 1024, - "tp": 8, - "ep": 1, - "dp-attn": False, - "conc": 4, - "max-model-len": 2048, - "exp-name": "test_exp" - } - ] - result = validate_matrix_output(entries) - assert result == entries - - -def test_validate_matrix_output_invalid(): - """Test validate_matrix_output with invalid entry.""" - entries = [ - { - "image": "test:latest", - "model": "test/model", - # Missing required fields - } - ] - with pytest.raises(ValueError, match="Matrix entry at index 0 failed validation"): - validate_matrix_output(entries) - - -def test_validate_matrix_output_multiple_entries(): - """Test validate_matrix_output with multiple entries.""" - entries = [ - { - "image": "test:latest", - "model": "test/model", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "isl": 1024, - "osl": 1024, - "tp": 8, - "ep": 1, - "dp-attn": False, - "conc": 4, - "max-model-len": 2048, - "exp-name": "test_exp" - }, - { - "image": "test2:latest", - "model": "test2/model", - "precision": "fp4", - "framework": "trt", - "runner": "h100", - "isl": 1024, - "osl": 1024, - "tp": 4, - "ep": 2, - "dp-attn": True, - "conc": 8, - "max-model-len": 2048, - "exp-name": "test_exp2" - } - ] - result = validate_matrix_output(entries) - assert len(result) == 2 - - -# Tests for validate_master_configs_structure -def test_validate_master_configs_structure_valid(sample_master_config): - """Test validation of valid master config.""" - validate_master_configs_structure(sample_master_config) - - -def test_validate_master_configs_structure_missing_field(): - """Test validation with missing required field.""" - config = { - 
"test-key": { - "image": "test:latest", - "model-prefix": "test", - # Missing other required fields - } - } - with pytest.raises(ValueError, match="Missing required field"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_wrong_type(): - """Test validation with wrong field type.""" - config = { - "test-key": { - "image": 123, # Should be string - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [] - } - } - with pytest.raises(ValueError, match="must be str"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_empty_seq_len_configs(): - """Test validation with empty seq-len-configs.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [] - } - } - with pytest.raises(ValueError, match="must be a non-empty list"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_invalid_search_space(): - """Test validation with invalid search-space.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [ - {"tp": 8} # Missing conc-start and conc-end - ] - } - ] - } - } - with pytest.raises(ValueError, match="Missing 'conc-start'"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_missing_search_space(): - """Test validation with missing search-space.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024 - # Missing search-space - } - ] - } - } - with pytest.raises(ValueError, match="Missing or invalid 'search-space'"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_search_space_not_list(): - """Test validation with search-space not being a list.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": "not_a_list" - } - ] - } - } - with pytest.raises(ValueError, match="Missing or invalid 'search-space'"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_extra_fields_in_search_space(): - """Test validation with extra fields in search-space.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [ - { - "tp": 8, - "conc-start": 1, - "conc-end": 4, - "invalid-field": "value" - } - ] - } - ] - } - } - with pytest.raises(ValueError, match="Extra fields"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_missing_isl(): - """Test validation with missing isl.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - 
{ - "osl": 1024, - "search-space": [{"tp": 8, "conc-start": 1, "conc-end": 4}] - } - ] - } - } - with pytest.raises(ValueError, match="Missing 'isl'"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_wrong_isl_type(): - """Test validation with wrong isl type.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": "not_int", - "osl": 1024, - "search-space": [{"tp": 8, "conc-start": 1, "conc-end": 4}] - } - ] - } - } - with pytest.raises(ValueError, match="'isl' must be int"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_missing_osl(): - """Test validation with missing osl.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "search-space": [{"tp": 8, "conc-start": 1, "conc-end": 4}] - } - ] - } - } - with pytest.raises(ValueError, match="Missing 'osl'"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_wrong_osl_type(): - """Test validation with wrong osl type.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": "not_int", - "search-space": [{"tp": 8, "conc-start": 1, "conc-end": 4}] - } - ] - } - } - with pytest.raises(ValueError, match="'osl' must be int"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_wrong_tp_type(): - """Test validation with wrong tp type.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [{"tp": "not_int", "conc-start": 1, "conc-end": 4}] - } - ] - } - } - with pytest.raises(ValueError, match="'tp' must be int"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_wrong_conc_start_type(): - """Test validation with wrong conc-start type.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [{"tp": 8, "conc-start": "not_int", "conc-end": 4}] - } - ] - } - } - with pytest.raises(ValueError, match="'conc-start' must be int"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_wrong_conc_end_type(): - """Test validation with wrong conc-end type.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [{"tp": 8, "conc-start": 1, "conc-end": "not_int"}] - } - ] - } - } - with pytest.raises(ValueError, match="'conc-end' must be int"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_wrong_ep_type(): - """Test validation with wrong ep type.""" - config = { - "test-key": { - "image": "test:latest", - "model": 
"test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [{"tp": 8, "conc-start": 1, "conc-end": 4, "ep": "not_int"}] - } - ] - } - } - with pytest.raises(ValueError, match="'ep' must be int"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_wrong_dp_attn_type(): - """Test validation with wrong dp-attn type.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [{"tp": 8, "conc-start": 1, "conc-end": 4, "dp-attn": "not_bool"}] - } - ] - } - } - with pytest.raises(ValueError, match="'dp-attn' must be bool"): - validate_master_configs_structure(config) - - -# Tests for load_config_files -def test_load_config_files_valid(temp_config_files): - """Test loading valid config files.""" - master_file, _ = temp_config_files - result = load_config_files([master_file]) - assert len(result) == 3 - assert "70b-fp8-vllm" in result - - -def test_load_config_files_multiple(tmp_path, sample_master_config): - """Test loading multiple config files.""" - file1 = tmp_path / "config1.yaml" - file2 = tmp_path / "config2.yaml" - - config1 = {"70b-fp8-vllm": sample_master_config["70b-fp8-vllm"]} - config2 = {"8b-fp4-trt": sample_master_config["8b-fp4-trt"]} - - with open(file1, 'w') as f: - yaml.dump(config1, f) - with open(file2, 'w') as f: - yaml.dump(config2, f) - - result = load_config_files([str(file1), str(file2)]) - assert len(result) == 2 - - -def test_load_config_files_not_found(): - """Test loading non-existent config file.""" - with pytest.raises(ValueError, match="does not exist"): - load_config_files(["/nonexistent/file.yaml"]) - - -def test_load_config_files_duplicate_keys(tmp_path, sample_master_config): - """Test loading files with duplicate keys.""" - file1 = tmp_path / "config1.yaml" - file2 = tmp_path / "config2.yaml" - - config1 = {"70b-fp8-vllm": sample_master_config["70b-fp8-vllm"]} - config2 = {"70b-fp8-vllm": sample_master_config["70b-fp8-vllm"]} # Duplicate - - with open(file1, 'w') as f: - yaml.dump(config1, f) - with open(file2, 'w') as f: - yaml.dump(config2, f) - - with pytest.raises(ValueError, match="Duplicate configuration keys"): - load_config_files([str(file1), str(file2)]) - - -# Tests for generate_full_sweep -def test_generate_full_sweep_basic(sample_master_config, temp_config_files): - """Test basic full sweep generation.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["70b"] - seq_lens = ["1k1k"] - step_size = 2 - precision = None - framework = None - runner_type = None - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - assert len(result) > 0 - assert all(entry['exp-name'].startswith('70b_1k1k') for entry in result) - assert all(entry['isl'] == 1024 and entry['osl'] == 1024 for entry in result) - - -def test_generate_full_sweep_with_optionals(sample_master_config, temp_config_files): - """Test full sweep with optional ep and dp-attn.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["70b"] - seq_lens = ["1k1k"] - step_size = 2 - precision = None - framework = None - runner_type = None - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - # Find entry 
with tp=8 which should have ep=2 and dp-attn=True
-    tp8_entries = [e for e in result if e['tp'] == 8]
-    assert len(tp8_entries) > 0
-    assert all(e['ep'] == 2 for e in tp8_entries)
-    assert all(e['dp-attn'] == True for e in tp8_entries)
-
-
-def test_generate_full_sweep_no_matches(sample_master_config, temp_config_files):
-    """Test full sweep with no matching configs."""
-    _, runner_file = temp_config_files
-
-    class Args:
-        model_prefix = ["nonexistent"]
-        seq_lens = ["1k1k"]
-        step_size = 2
-        precision = None
-        framework = None
-        runner_type = None
-        test_mode = False
-        runner_config = runner_file
-
-    with pytest.raises(ValueError, match="No configs found matching filters"):
-        generate_full_sweep(Args(), sample_master_config)
-
-
-def test_generate_full_sweep_different_seq_len(sample_master_config, temp_config_files):
-    """Test full sweep with different sequence length."""
-    _, runner_file = temp_config_files
-
-    class Args:
-        model_prefix = ["70b"]
-        seq_lens = ["1k8k"]
-        step_size = 2
-        precision = None
-        framework = None
-        runner_type = None
-        test_mode = False
-        runner_config = runner_file
-
-    result = generate_full_sweep(Args(), sample_master_config)
-    assert len(result) > 0
-    assert all(entry['isl'] == 1024 and entry['osl'] == 8192 for entry in result)
-
-
-def test_generate_full_sweep_step_size(sample_master_config, temp_config_files):
-    """Test full sweep with different step size."""
-    _, runner_file = temp_config_files
-
-    class Args:
-        model_prefix = ["8b"]
-        seq_lens = ["1k1k"]
-        step_size = 4
-        precision = None
-        framework = None
-        runner_type = None
-        test_mode = False
-        runner_config = runner_file
-
-    result = generate_full_sweep(Args(), sample_master_config)
-    # Should have entries at conc=4 and conc=16 (step_size=4, conc-start=4, conc-end=16)
-    conc_values = sorted(set(e['conc'] for e in result))
-    assert 4 in conc_values
-    assert 16 in conc_values
-
-
-def test_generate_full_sweep_seq_len_not_in_config(temp_config_files):
-    """Test full sweep when requested seq-len is not in config."""
-    _, runner_file = temp_config_files
-
-    config = {
-        "test-fp8-vllm": {
-            "image": "test:latest",
-            "model": "test/model",
-            "model-prefix": "test",
-            "precision": "fp8",
-            "framework": "vllm",
-            "runner": "h200",
-            "seq-len-configs": [
-                {
-                    "isl": 8192,
-                    "osl": 1024,  # Only has 8k1k, not 1k1k
-                    "search-space": [
-                        {"tp": 4, "conc-start": 1, "conc-end": 4}
-                    ]
-                }
-            ]
-        }
-    }
-
-    class Args:
-        model_prefix = ["test"]
-        seq_lens = ["1k1k"]  # Requesting 1k1k but config only has 8k1k
-        step_size = 2
-        precision = None
-        framework = None
-        runner_type = None
-        test_mode = False
-        runner_config = runner_file
-
-    # Should raise error since no matching seq-len
-    with pytest.raises(ValueError, match="No configs found matching filters"):
-        generate_full_sweep(Args(), config)
-
-
-def test_generate_full_sweep_concurrency_overshoot(temp_config_files):
-    """Test full sweep when concurrency step overshoots end value."""
-    _, runner_file = temp_config_files
-
-    config = {
-        "test-fp8-vllm": {
-            "image": "test:latest",
-            "model": "test/model",
-            "model-prefix": "test",
-            "precision": "fp8",
-            "framework": "vllm",
-            "runner": "h200",
-            "seq-len-configs": [
-                {
-                    "isl": 1024,
-                    "osl": 1024,
-                    "search-space": [
-                        {"tp": 4, "conc-start": 1, "conc-end": 5}  # 1, 3, then 3*3=9 overshoots, clamps to 5
-                    ]
-                }
-            ]
-        }
-    }
-
-    class Args:
-        model_prefix = ["test"]
-        seq_lens = ["1k1k"]
-        step_size = 3  # Will overshoot: 1, 3, 9 (clamped to 5)
-        precision = None
-        framework = None
-        runner_type = None
-        test_mode = False
-
runner_config = runner_file - - result = generate_full_sweep(Args(), config) - conc_values = sorted(set(e['conc'] for e in result)) - # Should have 1, 3, 5 (5 is the clamped value) - assert conc_values == [1, 3, 5] - - -# Tests for generate_full_sweep with filters -def test_generate_full_sweep_no_filters(sample_master_config, temp_config_files): - """Test filtered sweep with no filters.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = None - precision = None - framework = None - runner_type = None - seq_lens = None - step_size = 2 - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - assert len(result) > 0 - - -def test_generate_full_sweep_with_filters_model_prefix(sample_master_config, temp_config_files): - """Test filtered sweep with model prefix filter.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["70b"] - precision = None - framework = None - runner_type = None - seq_lens = None - step_size = 2 - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - assert all("70b" in entry['exp-name'] for entry in result) - - -def test_generate_full_sweep_with_filters_multiple_filters(sample_master_config, temp_config_files): - """Test filtered sweep with multiple filters.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["70b"] - precision = ["fp8"] - framework = ["vllm"] - runner_type = None - seq_lens = ["1k1k"] - step_size = 2 - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - assert len(result) > 0 - assert all(entry['precision'] == 'fp8' for entry in result) - assert all(entry['framework'] == 'vllm' for entry in result) - - -def test_generate_full_sweep_with_filters_test_mode(sample_master_config, temp_config_files): - """Test filtered sweep in test mode.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["70b"] - precision = None - framework = None - runner_type = None - seq_lens = ["1k1k"] - step_size = 2 - test_mode = True - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - # In test mode, should only get one entry per seq-len (highest TP, lowest conc) - assert len(result) == 1 # Only one config matches 70b with 1k1k - assert result[0]['tp'] == 8 # Highest TP - assert '70b_1k1k' in result[0]['exp-name'] - - -def test_generate_full_sweep_with_filters_runner_type_validation(sample_master_config, temp_config_files): - """Test filtered sweep with invalid runner type.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = None - precision = None - framework = None - runner_type = ["invalid-runner"] - seq_lens = None - step_size = 2 - test_mode = False - runner_config = runner_file - - with pytest.raises(ValueError, match="Invalid runner type"): - generate_full_sweep(Args(), sample_master_config) - - -def test_generate_full_sweep_with_filters_runner_type_no_config(sample_master_config): - """Test filtered sweep with runner type but no config file.""" - class Args: - model_prefix = None - precision = None - framework = None - runner_type = ["h200"] - seq_lens = None - step_size = 2 - test_mode = False - runner_config = None - - with pytest.raises(ValueError, match="runner-config is required"): - generate_full_sweep(Args(), sample_master_config) - - -def test_generate_full_sweep_with_filters_multiple_runner_types(sample_master_config, temp_config_files): - 
"""Test filtered sweep with multiple runner types.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = None - precision = None - framework = None - runner_type = ["h200", "h100"] - seq_lens = ["1k1k"] - step_size = 2 - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - runners = set(entry['runner'] for entry in result) - assert 'h200' in runners or 'h100' in runners - - -def test_generate_full_sweep_with_filters_no_matches(sample_master_config, temp_config_files): - """Test filtered sweep with no matching configs.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["nonexistent"] - precision = None - framework = None - runner_type = None - seq_lens = None - step_size = 2 - test_mode = False - runner_config = runner_file - - with pytest.raises(ValueError, match="No configs found matching filters"): - generate_full_sweep(Args(), sample_master_config) - - -def test_generate_full_sweep_with_filters_concurrency_overshoot(temp_config_files): - """Test filtered sweep when concurrency step overshoots end value.""" - _, runner_file = temp_config_files - - config = { - "test-fp8-vllm": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [ - {"tp": 4, "conc-start": 2, "conc-end": 7} # 2, 8 overshoots, clamps to 7 - ] - } - ] - } - } - - class Args: - model_prefix = None - precision = None - framework = None - runner_type = None - seq_lens = None - step_size = 4 # Will overshoot: 2, 8 (clamped to 7) - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), config) - conc_values = sorted(set(e['conc'] for e in result)) - # Should have 2, 7 (7 is the clamped value) - assert 2 in conc_values - assert 7 in conc_values - - -# Tests for generate_test_config -def test_generate_test_config_basic(sample_master_config, temp_config_files): - """Test basic test config generation.""" - _, runner_file = temp_config_files - - class Args: - key = "70b-fp8-vllm" - runner_config = runner_file - runner_node = "h200-nv_1" - seq_lens = None - step_size = 2 - test_mode = False - - result = generate_test_config(Args(), sample_master_config) - assert len(result) > 0 - - -def test_generate_test_config_test_mode(sample_master_config, temp_config_files): - """Test test config in test mode.""" - _, runner_file = temp_config_files - - class Args: - key = "70b-fp8-vllm" - runner_config = runner_file - runner_node = "h200-nv_1" - seq_lens = ["1k1k"] - step_size = 2 - test_mode = True - - result = generate_test_config(Args(), sample_master_config) - # In test mode, should only use lowest concurrency - assert all(entry['conc'] == 1 or entry['conc'] == 2 for entry in result) - - -def test_generate_test_config_specific_runner_node(sample_master_config, temp_config_files): - """Test test config with specific runner node.""" - _, runner_file = temp_config_files - - class Args: - key = "70b-fp8-vllm" - runner_config = runner_file - runner_node = "h200-nv_1" - seq_lens = None - step_size = 2 - test_mode = False - - result = generate_test_config(Args(), sample_master_config) - assert all(entry['runner'] == 'h200-nv_1' for entry in result) - - -def test_generate_test_config_invalid_key(sample_master_config, temp_config_files): - """Test test config with invalid key.""" - _, runner_file = temp_config_files - - class Args: - key = 
"nonexistent-key" - runner_config = runner_file - runner_node = None - seq_lens = None - step_size = 2 - test_mode = False - - with pytest.raises(ValueError, match="does not exist in config files"): - generate_test_config(Args(), sample_master_config) - - -def test_generate_test_config_invalid_runner_node(sample_master_config, temp_config_files): - """Test test config with invalid runner node.""" - _, runner_file = temp_config_files - - class Args: - key = "70b-fp8-vllm" - runner_config = runner_file - runner_node = "invalid-node" - seq_lens = None - step_size = 2 - test_mode = False - - with pytest.raises(ValueError, match="is not compatible"): - generate_test_config(Args(), sample_master_config) - - -def test_generate_test_config_missing_runner_config(sample_master_config): - """Test test config with missing runner config file.""" - class Args: - key = "70b-fp8-vllm" - runner_config = "/nonexistent/file.yaml" - runner_node = None - seq_lens = None - step_size = 2 - test_mode = False - - with pytest.raises(ValueError, match="does not exist"): - generate_test_config(Args(), sample_master_config) - - -def test_generate_test_config_concurrency_overshoot(temp_config_files): - """Test test config when concurrency step overshoots end value.""" - _, runner_file = temp_config_files - - config = { - "test-fp8-vllm": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [ - {"tp": 4, "conc-start": 1, "conc-end": 6} - ] - } - ] - } - } - - class Args: - key = "test-fp8-vllm" - runner_config = runner_file - runner_node = "h200-nv_1" - seq_lens = None - step_size = 4 # Will overshoot: 1, 4, 16 (clamped to 6) - test_mode = False - - result = generate_test_config(Args(), config) - conc_values = sorted(set(e['conc'] for e in result)) - assert 1 in conc_values - assert 4 in conc_values - assert 6 in conc_values - - -# Tests for generate_runner_model_sweep_config -def test_generate_runner_model_sweep_config(sample_master_config, temp_config_files): - """Test runner-model sweep config generation.""" - _, runner_file = temp_config_files - - class Args: - runner_type = "h200" - runner_config = runner_file - runner_node_filter = None - - result = generate_runner_model_sweep_config(Args(), sample_master_config) - assert len(result) > 0 - # Should have entries for each runner node under h200 - runners = set(entry['runner'] for entry in result) - assert 'h200-nv_1' in runners - assert 'h200-nv_2' in runners - - -def test_generate_runner_model_sweep_config_invalid_runner(sample_master_config, temp_config_files): - """Test runner-model sweep with invalid runner type.""" - _, runner_file = temp_config_files - - class Args: - runner_type = "invalid-runner" - runner_config = runner_file - runner_node_filter = None - - with pytest.raises(ValueError, match="does not exist in runner config"): - generate_runner_model_sweep_config(Args(), sample_master_config) - - -def test_generate_runner_model_sweep_config_with_node_filter(sample_master_config, temp_config_files): - """Test runner-model sweep with runner node filter.""" - _, runner_file = temp_config_files - - class Args: - runner_type = "h200" - runner_config = runner_file - runner_node_filter = "nv_1" - - result = generate_runner_model_sweep_config(Args(), sample_master_config) - # Should only have entries for h200-nv_1 - runners = set(entry['runner'] for entry in result) - assert 'h200-nv_1' in runners - 
assert 'h200-nv_2' not in runners - - -def test_generate_runner_model_sweep_config_with_node_filter_multiple_matches(sample_master_config, temp_config_files): - """Test runner-model sweep with runner node filter matching multiple nodes.""" - _, runner_file = temp_config_files - - class Args: - runner_type = "h200" - runner_config = runner_file - runner_node_filter = "nv" # Should match both nv_1 and nv_2 - - result = generate_runner_model_sweep_config(Args(), sample_master_config) - runners = set(entry['runner'] for entry in result) - assert 'h200-nv_1' in runners - assert 'h200-nv_2' in runners - - -def test_generate_runner_model_sweep_config_with_node_filter_no_matches(sample_master_config, temp_config_files): - """Test runner-model sweep with runner node filter that matches no nodes.""" - _, runner_file = temp_config_files - - class Args: - runner_type = "h200" - runner_config = runner_file - runner_node_filter = "nonexistent" - - with pytest.raises(ValueError, match="No runner nodes found matching filter"): - generate_runner_model_sweep_config(Args(), sample_master_config) - - -def test_generate_runner_model_sweep_config_without_node_filter(sample_master_config, temp_config_files): - """Test runner-model sweep without runner node filter (default behavior).""" - _, runner_file = temp_config_files - - class Args: - runner_type = "h200" - runner_config = runner_file - runner_node_filter = None - - result = generate_runner_model_sweep_config(Args(), sample_master_config) - # Should have entries for all h200 nodes - runners = set(entry['runner'] for entry in result) - assert 'h200-nv_1' in runners - assert 'h200-nv_2' in runners - - -# Tests for generate_runner_sweep_config -def test_generate_runner_sweep_config(sample_master_config, temp_config_files): - """Test runner sweep config generation.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = "70b" - runner_type = "h200" - precision = None - framework = None - runner_config = runner_file - - result = generate_runner_sweep_config(Args(), sample_master_config) - assert len(result) > 0 - - -def test_generate_runner_sweep_config_with_filters(sample_master_config, temp_config_files): - """Test runner sweep with precision and framework filters.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = "70b" - runner_type = "h200" - precision = "fp8" - framework = "vllm" - runner_config = runner_file - - result = generate_runner_sweep_config(Args(), sample_master_config) - assert all(entry['precision'] == 'fp8' for entry in result) - assert all(entry['framework'] == 'vllm' for entry in result) - - -def test_generate_runner_sweep_config_no_matches(sample_master_config, temp_config_files): - """Test runner sweep with no matching configs.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = "nonexistent" - runner_type = "h200" - precision = None - framework = None - runner_config = runner_file - - with pytest.raises(ValueError, match="No configs found matching"): - generate_runner_sweep_config(Args(), sample_master_config) - - -# Tests for generate_custom_test -def test_generate_custom_test(temp_config_files): - """Test custom test generation.""" - _, runner_file = temp_config_files - - class Args: - runner_label = "h200" - image = "vllm/vllm-openai:latest" - model = "test/model" - framework = "vllm" - precision = "fp8" - exp_name = "custom_test" - runner_config = runner_file - - result = generate_custom_test(Args()) - assert len(result) == 1 - assert result[0]['image'] == 
"vllm/vllm-openai:latest" - assert result[0]['exp-name'] == "custom_test" - - -def test_generate_custom_test_invalid_runner(temp_config_files): - """Test custom test with invalid runner label.""" - _, runner_file = temp_config_files - - class Args: - runner_label = "invalid-runner" - image = "vllm/vllm-openai:latest" - model = "test/model" - framework = "vllm" - precision = "fp8" - exp_name = "custom_test" - runner_config = runner_file - - with pytest.raises(ValueError, match="Unable to find specified runner label"): - generate_custom_test(Args()) - - -# Tests for main function -def test_main_full_sweep(temp_config_files): - """Test main function with full-sweep command.""" - master_file, _ = temp_config_files - - test_args = [ - "generate_sweep_configs.py", - "full-sweep", - "--config-files", master_file, - "--seq-lens", "1k1k", - "--model-prefix", "70b", - "--step-size", "2" - ] - - with patch('sys.argv', test_args): - result = main() - assert len(result) > 0 - - -def test_main_full_sweep_with_filters(temp_config_files): - """Test main function with full-sweep command with filters.""" - master_file, runner_file = temp_config_files - - test_args = [ - "generate_sweep_configs.py", - "full-sweep", - "--config-files", master_file, - "--runner-config", runner_file, - "--model-prefix", "70b", - "--precision", "fp8", - "--test-mode" - ] - - with patch('sys.argv', test_args): - result = main() - assert len(result) > 0 - - -def test_main_test_config(temp_config_files): - """Test main function with test-config command.""" - master_file, runner_file = temp_config_files - - test_args = [ - "generate_sweep_configs.py", - "test-config", - "--config-files", master_file, - "--runner-config", runner_file, - "--key", "70b-fp8-vllm", - "--runner-node", "h200-nv_1", - "--test-mode" - ] - - with patch('sys.argv', test_args): - result = main() - assert len(result) > 0 - - -def test_main_runner_model_sweep(temp_config_files): - """Test main function with runner-model-sweep command.""" - master_file, runner_file = temp_config_files - - test_args = [ - "generate_sweep_configs.py", - "runner-model-sweep", - "--config-files", master_file, - "--runner-config", runner_file, - "--runner-type", "h200" - ] - - with patch('sys.argv', test_args): - result = main() - assert len(result) > 0 - - -def test_main_runner_model_sweep_with_node_filter(temp_config_files): - """Test main function with runner-model-sweep command with node filter.""" - master_file, runner_file = temp_config_files - - test_args = [ - "generate_sweep_configs.py", - "runner-model-sweep", - "--config-files", master_file, - "--runner-config", runner_file, - "--runner-type", "h200", - "--runner-node-filter", "nv_1" - ] - - with patch('sys.argv', test_args): - result = main() - assert len(result) > 0 - runners = set(entry['runner'] for entry in result) - assert 'h200-nv_1' in runners - assert 'h200-nv_2' not in runners - - -def test_main_runner_sweep(temp_config_files): - """Test main function with runner-sweep command.""" - master_file, runner_file = temp_config_files - - test_args = [ - "generate_sweep_configs.py", - "runner-sweep", - "--config-files", master_file, - "--runner-config", runner_file, - "--runner-type", "h200", - "--model-prefix", "70b" - ] - - with patch('sys.argv', test_args): - result = main() - assert len(result) > 0 - - -def test_main_custom(temp_config_files): - """Test main function with custom command.""" - master_file, runner_file = temp_config_files - - test_args = [ - "generate_sweep_configs.py", - "custom", - "--config-files", 
master_file, - "--runner-config", runner_file, - "--runner-label", "h200", - "--image", "test:latest", - "--model", "test/model", - "--framework", "vllm", - "--precision", "fp8", - "--exp-name", "custom_test" - ] - - with patch('sys.argv', test_args): - result = main() - assert len(result) == 1 - - -def test_main_invalid_config_structure(tmp_path): - """Test main with invalid config structure.""" - invalid_file = tmp_path / "invalid.yaml" - with open(invalid_file, 'w') as f: - yaml.dump({"key": {"image": "test"}}, f) # Missing required fields - - test_args = [ - "generate_sweep_configs.py", - "full-sweep", - "--config-files", str(invalid_file), - "--seq-lens", "1k1k", - "--model-prefix", "test" - ] - - with patch('sys.argv', test_args): - with pytest.raises(ValueError): - main() - - -def test_main_validation_failure(temp_config_files, monkeypatch): - """Test main with validation failure on output.""" - master_file, _ = temp_config_files - - # Monkey patch validate_matrix_output to always fail - def mock_validate(entries): - raise ValueError("Validation failed") - - monkeypatch.setattr('generate_sweep_configs.validate_matrix_output', mock_validate) - - test_args = [ - "generate_sweep_configs.py", - "full-sweep", - "--config-files", master_file, - "--seq-lens", "1k1k", - "--model-prefix", "70b" - ] - - with patch('sys.argv', test_args): - with pytest.raises(ValueError, match="Validation failed"): - main() - - -# Edge case tests -def test_concurrency_step_reaches_exact_end(sample_master_config, temp_config_files): - """Test that concurrency stepping reaches exact end value.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["8b"] - seq_lens = ["1k1k"] - step_size = 2 - precision = None - framework = None - runner_type = None - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - # conc-start=4, conc-end=16, step=2 should give 4,8,16 - conc_values = sorted(set(e['conc'] for e in result)) - assert 16 in conc_values - - -def test_multiple_model_prefixes_filtered_sweep(sample_master_config, temp_config_files): - """Test filtered sweep with multiple model prefixes.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["70b", "8b"] - precision = None - framework = None - runner_type = None - seq_lens = ["1k1k"] - step_size = 2 - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - exp_names = [e['exp-name'] for e in result] - assert any('70b' in name for name in exp_names) - assert any('8b' in name for name in exp_names) - - -def test_seq_len_filter_multiple(sample_master_config, temp_config_files): - """Test filtering with multiple sequence lengths.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["70b"] - precision = None - framework = None - runner_type = None - seq_lens = ["1k1k", "1k8k"] - step_size = 2 - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - seq_lens = set((e['isl'], e['osl']) for e in result) - assert (1024, 1024) in seq_lens - assert (1024, 8192) in seq_lens - - -def test_default_ep_dp_attn_values(sample_master_config, temp_config_files): - """Test that default ep and dp-attn values are set correctly.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["8b"] - seq_lens = ["1k1k"] - step_size = 2 - precision = None - framework = None - runner_type = None - test_mode = False - runner_config = runner_file - - 
result = generate_full_sweep(Args(), sample_master_config)
-    # 8b config doesn't specify ep/dp-attn, so should use defaults
-    assert all(e['ep'] == 1 for e in result)
-    assert all(e['dp-attn'] == False for e in result)
-
-
-def test_max_model_len_calculation(sample_master_config, temp_config_files):
-    """Test that max-model-len is calculated correctly."""
-    _, runner_file = temp_config_files
-
-    class Args:
-        model_prefix = ["70b"]
-        seq_lens = ["1k8k"]
-        step_size = 2
-        precision = None
-        framework = None
-        runner_type = None
-        test_mode = False
-        runner_config = runner_file
-
-    result = generate_full_sweep(Args(), sample_master_config)
-    # isl=1024, osl=8192, so max-model-len should be 1024+8192+200=9416
-    assert all(e['max-model-len'] == 9416 for e in result)
-
-
-if __name__ == "__main__":
-    pytest.main([__file__, "-v", "--cov=generate_sweep_configs", "--cov-report=term-missing"])
diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py
new file mode 100644
index 000000000..cba89c448
--- /dev/null
+++ b/utils/matrix_logic/generate_sweep_configs.py
@@ -0,0 +1,748 @@
+import json
+import argparse
+import sys
+from pathlib import Path
+
+# Ensure sibling modules are importable regardless of how the script is invoked
+sys.path.insert(0, str(Path(__file__).resolve().parent))
+
+from validation import (
+    validate_matrix_entry,
+    load_config_files,
+    load_runner_file,
+    Fields
+)
+
+seq_len_stoi = {
+    "1k1k": (1024, 1024),
+    "1k8k": (1024, 8192),
+    "8k1k": (8192, 1024)
+}
+
+# Reverse mapping for exp-name generation
+seq_len_itos = {v: k for k, v in seq_len_stoi.items()}
+
+
+def seq_len_to_str(isl: int, osl: int) -> str:
+    """Convert sequence lengths to short string representation.
+
+    Returns the short name (e.g., '1k1k') if it exists in the mapping,
+    otherwise returns 'isl_osl' format.
+    """
+    return seq_len_itos.get((isl, osl), f"{isl}_{osl}")
+
+def mark_eval_entries(matrix_values: list[dict]) -> list[dict]:
+    """Mark entries that should run evaluation.
+
+    For each unique (model, runner, framework, precision, isl, osl) combination:
+    - Mark highest TP with highest conc
+    - Mark lowest TP with highest conc
+    """
+    from collections import defaultdict
+
+    # Group entries by (model, runner, framework, precision, isl, osl)
+    # This ensures we compare within the same configuration, not across different frameworks
+    groups = defaultdict(list)
+    for i, entry in enumerate(matrix_values):
+        key = (
+            entry[Fields.MODEL.value],
+            entry[Fields.RUNNER.value],
+            entry[Fields.FRAMEWORK.value],
+            entry[Fields.PRECISION.value],
+            entry[Fields.ISL.value],
+            entry[Fields.OSL.value]
+        )
+        groups[key].append((i, entry))
+
+    # For each group, find highest TP/highest conc and lowest TP/highest conc
+    eval_indices = set()
+    for key, entries in groups.items():
+        if not entries:
+            continue
+
+        # Find min and max TP values
+        min_tp = min(e[Fields.TP.value] for _, e in entries)
+        max_tp = max(e[Fields.TP.value] for _, e in entries)
+
+        # Find highest conc for highest TP
+        highest_tp_entries = [(i, e) for i, e in entries if e[Fields.TP.value] == max_tp]
+        if highest_tp_entries:
+            max_conc_highest_tp = max(e[Fields.CONC.value] for _, e in highest_tp_entries)
+            for i, e in highest_tp_entries:
+                if e[Fields.CONC.value] == max_conc_highest_tp:
+                    eval_indices.add(i)
+
+        # Find highest conc for lowest TP (only if different from max_tp)
+        if min_tp != max_tp:
+            lowest_tp_entries = [(i, e) for i, e in entries if e[Fields.TP.value] == min_tp]
+            if lowest_tp_entries:
+                max_conc_lowest_tp = max(e[Fields.CONC.value] for _, e in lowest_tp_entries)
+                for i, e in lowest_tp_entries:
+                    if e[Fields.CONC.value] == max_conc_lowest_tp:
+                        eval_indices.add(i)
+
+    # Mark the selected entries
+    for i, entry in enumerate(matrix_values):
+        entry[Fields.FIELD_RUN_EVAL.value] = i in eval_indices
+
+    return matrix_values
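To make the marking rule concrete, here is a small illustrative sketch; the literal key strings and the entries themselves are assumptions for illustration only (the real keys come from the Fields enum in validation.py):

# Hypothetical group: one (model, runner, framework, precision, isl, osl)
# combination with two TP values and two concurrencies each.
entries = [
    {"model": "m", "runner": "h200", "framework": "vllm", "precision": "fp8",
     "isl": 1024, "osl": 1024, "tp": tp, "conc": conc}
    for tp in (2, 4) for conc in (8, 16)
]
marked = mark_eval_entries(entries)
# Expected marking: (tp=4, conc=16) for the highest TP and (tp=2, conc=16)
# for the lowest TP are flagged to run evals; (tp=2, conc=8) and (tp=4, conc=8)
# are not. The flag is stored under Fields.FIELD_RUN_EVAL.value, whose exact
# string is defined in validation.py.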
+
+
+def generate_full_sweep(args, all_config_data, runner_data):
+    """Generate full sweep configurations with optional filtering.
+
+    Supports filtering by model prefix, precision, framework, runner type, sequence lengths,
+    and max concurrency.
+
+    All filters are optional - can generate sweeps for all configs or filter by specific criteria.
+
+    Assumes all_config_data has been validated by validate_master_config().
+    """
+    # Validate runner types if specified
+    if args.runner_type:
+        valid_runner_types = set(runner_data.keys())
+        invalid_runners = set(args.runner_type) - valid_runner_types
+        if invalid_runners:
+            raise ValueError(
+                f"Invalid runner type(s): {invalid_runners}. 
" + f"Valid runner types are: {', '.join(sorted(valid_runner_types))}") + + matrix_values = [] + + # Convert seq-lens to set of (isl, osl) tuples for filtering + seq_lens_filter = None + if args.seq_lens: + seq_lens_filter = {seq_len_stoi[sl] for sl in args.seq_lens} + + # Iterate through all configurations and apply filters as specified (this is just "selecting" + # configs from all of the master configs subject to some pattern matching) + for key, val in all_config_data.items(): + # Filter by model prefix if specified + if args.model_prefix: + if not any(key.startswith(prefix) for prefix in args.model_prefix): + continue + + # Filter by precision if specified + if args.precision and val[Fields.PRECISION.value] not in args.precision: + continue + + # Filter by framework if specified + if args.framework and val[Fields.FRAMEWORK.value] not in args.framework: + continue + + # Filter by runner type if specified + if args.runner_type and val[Fields.RUNNER.value] not in args.runner_type: + continue + + # Check if this is a multinode config + is_multinode = val.get(Fields.MULTINODE.value, False) + # Get disagg value, defaulting to False if not specified + disagg = val.get(Fields.DISAGG.value, False) + + seq_len_configs = val[Fields.SEQ_LEN_CONFIGS.value] + image = val[Fields.IMAGE.value] + model = val[Fields.MODEL.value] + precision = val[Fields.PRECISION.value] + framework = val[Fields.FRAMEWORK.value] + runner = val[Fields.RUNNER.value] + model_code = val[Fields.MODEL_PREFIX.value] + + for seq_config in seq_len_configs: + isl = seq_config[Fields.ISL.value] + osl = seq_config[Fields.OSL.value] + + # Filter by sequence lengths if specified + if seq_lens_filter and (isl, osl) not in seq_lens_filter: + continue + + bmk_space = seq_config[Fields.SEARCH_SPACE.value] + + for bmk in bmk_space: + if is_multinode: + # Skip multinode configs when --single-node is specified + if not args.multi_node: + continue + + # Multinode configuration + # spec_decoding defaults to "none" if not specified + spec_decoding = bmk.get(Fields.SPEC_DECODING.value, "none") + + prefill = bmk[Fields.PREFILL.value] + decode = bmk[Fields.DECODE.value] + + # Get concurrency values (can be list or range) + conc_list = bmk.get(Fields.CONC_LIST.value) + # If it's a list + if conc_list: + conc_values = conc_list + # If it's a range + else: + conc_start = bmk[Fields.CONC_START.value] + conc_end = bmk[Fields.CONC_END.value] + conc_values = [] + conc = conc_start + while conc <= conc_end: + conc_values.append(conc) + if conc == conc_end: + break + conc *= args.step_size + if conc > conc_end: + conc = conc_end + + # Apply max-conc filter if specified + # If max_conc is less than all values, use max_conc directly (if valid) + if args.max_conc is not None: + filtered_conc = [c for c in conc_values if c <= args.max_conc] + if not filtered_conc: + # No existing values <= max_conc, so use max_conc directly if valid + if args.max_conc > 0: + conc_values = [args.max_conc] + else: + continue # Skip if max_conc is not positive + else: + conc_values = filtered_conc + + # For multinode, create a single entry with conc as a list + seq_len_str = seq_len_to_str(isl, osl) + entry = { + Fields.IMAGE.value: image, + Fields.MODEL.value: model, + Fields.MODEL_PREFIX.value: model_code, + Fields.PRECISION.value: precision, + Fields.FRAMEWORK.value: framework, + Fields.RUNNER.value: runner, + Fields.ISL.value: isl, + Fields.OSL.value: osl, + Fields.SPEC_DECODING.value: spec_decoding, + Fields.PREFILL.value: prefill, + Fields.DECODE.value: decode, + 
Fields.CONC.value: conc_values, # Pass the entire list for multinode + Fields.MAX_MODEL_LEN.value: isl + osl + 200, + Fields.EXP_NAME.value: f"{model_code}_{seq_len_str}", + Fields.DISAGG.value: disagg, + } + + validate_matrix_entry(entry, is_multinode) + matrix_values.append(entry) + elif args.single_node: + # Single-node configuration + tp = bmk[Fields.TP.value] + conc_start = bmk[Fields.CONC_START.value] + conc_end = bmk[Fields.CONC_END.value] + ep = bmk.get(Fields.EP.value) + dp_attn = bmk.get(Fields.DP_ATTN.value) + spec_decoding = bmk.get(Fields.SPEC_DECODING.value, "none") + + # Apply max-tp filter if specified + # If tp > max_tp, use max_tp instead of skipping (if valid) + if args.max_tp is not None: + if args.max_tp <= 0: + continue # Skip if max_tp is not positive + if tp > args.max_tp: + tp = args.max_tp + + # Apply max-ep filter if specified + # If ep > max_ep, use max_ep instead of skipping (if valid) + if args.max_ep is not None: + if args.max_ep <= 0: + continue # Skip if max_ep is not positive + if ep is not None and ep > args.max_ep: + ep = args.max_ep + + # Apply max-conc filter if specified + # If conc_start > max_conc, use max_conc as both start and end (if valid) + if args.max_conc is not None: + if args.max_conc <= 0: + continue # Skip if max_conc is not positive + if conc_start > args.max_conc: + conc_start = args.max_conc + conc_end = args.max_conc + else: + conc_end = min(conc_end, args.max_conc) + + conc = conc_start + while conc <= conc_end: + seq_len_str = seq_len_to_str(isl, osl) + entry = { + Fields.IMAGE.value: image, + Fields.MODEL.value: model, + Fields.MODEL_PREFIX.value: model_code, + Fields.PRECISION.value: precision, + Fields.FRAMEWORK.value: framework, + Fields.RUNNER.value: runner, + Fields.ISL.value: isl, + Fields.OSL.value: osl, + Fields.TP.value: tp, + Fields.CONC.value: conc, + Fields.MAX_MODEL_LEN.value: isl + osl + 200, + Fields.EP.value: 1, # Default + Fields.DP_ATTN.value: False, # Default + Fields.SPEC_DECODING.value: spec_decoding, + Fields.EXP_NAME.value: f"{model_code}_{seq_len_str}", + Fields.DISAGG.value: disagg, + } + + if ep is not None: + entry[Fields.EP.value] = ep + if dp_attn is not None: + entry[Fields.DP_ATTN.value] = dp_attn + + validate_matrix_entry(entry, is_multinode) + matrix_values.append(entry) + + if conc == conc_end: + break + conc *= args.step_size + if conc > conc_end: + conc = conc_end + + return matrix_values + + +def generate_runner_model_sweep_config(args, all_config_data, runner_data): + """Generate runner-model sweep configurations. + + Assumes all_config_data has been validated by validate_config_structure(). + Supports both single-node and multinode configurations. + """ + runner_nodes = runner_data.get(args.runner_type) + + if not runner_nodes: + raise ValueError( + f"Runner '{args.runner_type}' does not exist in runner config '{args.runner_config}'. 
Must choose from existing runner types: '{', '.join(runner_data.keys())}'.") + + # Filter runner nodes if filter is specified + if args.runner_node_filter: + runner_nodes = [ + node for node in runner_nodes if args.runner_node_filter in node] + if not runner_nodes: + raise ValueError( + f"No runner nodes found matching filter '{args.runner_node_filter}' for runner type '{args.runner_type}'.") + + matrix_values = [] + for key, val in all_config_data.items(): + # Only consider configs with specified runner + if val[Fields.RUNNER.value] != args.runner_type: + continue + + is_multinode = val.get(Fields.MULTINODE.value, False) + + # Skip configs that don't match the requested node type + if args.single_node and is_multinode: + continue + if args.multi_node and not is_multinode: + continue + + # Get model code for exp_name + model_code = val[Fields.MODEL_PREFIX.value] + # Get disagg value, defaulting to False if not specified + disagg = val.get(Fields.DISAGG.value, False) + + # Find 1k1k config + target_config = None + for config in val[Fields.SEQ_LEN_CONFIGS.value]: + if config[Fields.ISL.value] == 1024 and config[Fields.OSL.value] == 1024: + target_config = config + break + + if target_config is None: + continue + + if is_multinode: + # For multinode, find the search space entry with the lowest concurrency + def get_lowest_conc(search_space_entry): + conc_list = search_space_entry.get(Fields.CONC_LIST.value, []) + return min(conc_list) if conc_list else float('inf') + + lowest_conc_entry = min( + target_config[Fields.SEARCH_SPACE.value], key=get_lowest_conc) + + conc_list = lowest_conc_entry.get(Fields.CONC_LIST.value, []) + lowest_conc = min(conc_list) if conc_list else 1 + + spec_decoding = lowest_conc_entry.get( + Fields.SPEC_DECODING.value, "none") + prefill_config = lowest_conc_entry[Fields.PREFILL.value] + decode_config = lowest_conc_entry[Fields.DECODE.value] + + for node in runner_nodes: + entry = { + Fields.IMAGE.value: val[Fields.IMAGE.value], + Fields.MODEL.value: val[Fields.MODEL.value], + Fields.MODEL_PREFIX.value: model_code, + Fields.PRECISION.value: val[Fields.PRECISION.value], + Fields.FRAMEWORK.value: val[Fields.FRAMEWORK.value], + Fields.RUNNER.value: node, + Fields.ISL.value: 1024, + Fields.OSL.value: 1024, + Fields.SPEC_DECODING.value: spec_decoding, + Fields.PREFILL.value: { + Fields.NUM_WORKER.value: prefill_config[Fields.NUM_WORKER.value], + Fields.TP.value: prefill_config[Fields.TP.value], + Fields.EP.value: prefill_config[Fields.EP.value], + Fields.DP_ATTN.value: prefill_config[Fields.DP_ATTN.value], + Fields.ADDITIONAL_SETTINGS.value: prefill_config.get(Fields.ADDITIONAL_SETTINGS.value, []), + }, + Fields.DECODE.value: { + Fields.NUM_WORKER.value: decode_config[Fields.NUM_WORKER.value], + Fields.TP.value: decode_config[Fields.TP.value], + Fields.EP.value: decode_config[Fields.EP.value], + Fields.DP_ATTN.value: decode_config[Fields.DP_ATTN.value], + Fields.ADDITIONAL_SETTINGS.value: decode_config.get(Fields.ADDITIONAL_SETTINGS.value, []), + }, + Fields.CONC.value: [lowest_conc], + Fields.MAX_MODEL_LEN.value: 2048, + Fields.EXP_NAME.value: f"{model_code}_test", + Fields.DISAGG.value: disagg, + } + matrix_values.append(validate_matrix_entry(entry, is_multinode=True)) + else: + # Single-node: pick highest TP config with lowest concurrency + highest_tp_bmk = max( + target_config[Fields.SEARCH_SPACE.value], key=lambda x: x[Fields.TP.value]) + highest_tp = highest_tp_bmk[Fields.TP.value] + lowest_conc = highest_tp_bmk[Fields.CONC_START.value] + + ep = 
highest_tp_bmk.get(Fields.EP.value) + dp_attn = highest_tp_bmk.get(Fields.DP_ATTN.value) + spec_decoding = highest_tp_bmk.get(Fields.SPEC_DECODING.value, "none") + + for node in runner_nodes: + entry = { + Fields.IMAGE.value: val[Fields.IMAGE.value], + Fields.MODEL.value: val[Fields.MODEL.value], + Fields.MODEL_PREFIX.value: model_code, + Fields.PRECISION.value: val[Fields.PRECISION.value], + Fields.FRAMEWORK.value: val[Fields.FRAMEWORK.value], + Fields.RUNNER.value: node, + Fields.ISL.value: 1024, + Fields.OSL.value: 1024, + Fields.TP.value: highest_tp, + Fields.EP.value: ep if ep is not None else 1, + Fields.DP_ATTN.value: dp_attn if dp_attn is not None else False, + Fields.SPEC_DECODING.value: spec_decoding, + Fields.CONC.value: lowest_conc, + Fields.MAX_MODEL_LEN.value: 2048, + Fields.EXP_NAME.value: f"{model_code}_test", + Fields.DISAGG.value: disagg, + } + matrix_values.append(validate_matrix_entry(entry, is_multinode=False)) + + return matrix_values + + +def generate_test_config_sweep(args, all_config_data): + """Generate full sweep for specific config keys. + + Validates that all specified config keys exist before generating. + Expands all configs fully without any filtering. + """ + # Validate all config keys exist + missing_keys = [key for key in args.config_keys if key not in all_config_data] + if missing_keys: + available_keys = sorted(all_config_data.keys()) + raise ValueError( + f"Config key(s) not found: {', '.join(missing_keys)}.\n" + f"Available keys: {', '.join(available_keys)}" + ) + + matrix_values = [] + + for key in args.config_keys: + val = all_config_data[key] + is_multinode = val.get(Fields.MULTINODE.value, False) + + image = val[Fields.IMAGE.value] + model = val[Fields.MODEL.value] + model_code = val[Fields.MODEL_PREFIX.value] + precision = val[Fields.PRECISION.value] + framework = val[Fields.FRAMEWORK.value] + runner = val[Fields.RUNNER.value] + disagg = val.get(Fields.DISAGG.value, False) + + for seq_len_config in val[Fields.SEQ_LEN_CONFIGS.value]: + isl = seq_len_config[Fields.ISL.value] + osl = seq_len_config[Fields.OSL.value] + seq_len_str = seq_len_to_str(isl, osl) + + for bmk in seq_len_config[Fields.SEARCH_SPACE.value]: + if is_multinode: + # Multinode config + spec_decoding = bmk.get(Fields.SPEC_DECODING.value, "none") + prefill = bmk[Fields.PREFILL.value] + decode = bmk[Fields.DECODE.value] + + # Get concurrency values + if Fields.CONC_LIST.value in bmk: + conc_values = bmk[Fields.CONC_LIST.value] + else: + conc_start = bmk[Fields.CONC_START.value] + conc_end = bmk[Fields.CONC_END.value] + conc_values = [] + conc = conc_start + while conc <= conc_end: + conc_values.append(conc) + if conc == conc_end: + break + conc *= 2 + if conc > conc_end: + conc = conc_end + + entry = { + Fields.IMAGE.value: image, + Fields.MODEL.value: model, + Fields.MODEL_PREFIX.value: model_code, + Fields.PRECISION.value: precision, + Fields.FRAMEWORK.value: framework, + Fields.RUNNER.value: runner, + Fields.ISL.value: isl, + Fields.OSL.value: osl, + Fields.SPEC_DECODING.value: spec_decoding, + Fields.PREFILL.value: prefill, + Fields.DECODE.value: decode, + Fields.CONC.value: conc_values, + Fields.MAX_MODEL_LEN.value: isl + osl + 200, + Fields.EXP_NAME.value: f"{model_code}_{seq_len_str}", + Fields.DISAGG.value: disagg, + } + matrix_values.append(validate_matrix_entry(entry, is_multinode=True)) + else: + # Single-node config + tp = bmk[Fields.TP.value] + ep = bmk.get(Fields.EP.value) + dp_attn = bmk.get(Fields.DP_ATTN.value) + spec_decoding = bmk.get(Fields.SPEC_DECODING.value, 
"none") + + # Get concurrency values + if Fields.CONC_LIST.value in bmk: + conc_values = bmk[Fields.CONC_LIST.value] + else: + conc_start = bmk[Fields.CONC_START.value] + conc_end = bmk[Fields.CONC_END.value] + conc_values = [] + conc = conc_start + while conc <= conc_end: + conc_values.append(conc) + if conc == conc_end: + break + conc *= 2 + if conc > conc_end: + conc = conc_end + + for conc in conc_values: + entry = { + Fields.IMAGE.value: image, + Fields.MODEL.value: model, + Fields.MODEL_PREFIX.value: model_code, + Fields.PRECISION.value: precision, + Fields.FRAMEWORK.value: framework, + Fields.RUNNER.value: runner, + Fields.ISL.value: isl, + Fields.OSL.value: osl, + Fields.TP.value: tp, + Fields.CONC.value: conc, + Fields.MAX_MODEL_LEN.value: isl + osl + 200, + Fields.EP.value: ep if ep is not None else 1, + Fields.DP_ATTN.value: dp_attn if dp_attn is not None else False, + Fields.SPEC_DECODING.value: spec_decoding, + Fields.EXP_NAME.value: f"{model_code}_{seq_len_str}", + Fields.DISAGG.value: disagg, + } + matrix_values.append(validate_matrix_entry(entry, is_multinode=False)) + + return matrix_values + + +def main(): + # Create parent parser with common arguments + parent_parser = argparse.ArgumentParser(add_help=False) + parent_parser.add_argument( + '--config-files', + nargs='+', + required=True, + help='One or more configuration files (YAML format)' + ) + parent_parser.add_argument( + '--runner-config', + required=True, + help='Configuration file holding runner information (YAML format)' + ) + parent_parser.add_argument( + '--run-evals', + action='store_true', + required=False, + help='When specifiedm run evals on a subset of configs.' + ) + + # Create main parser + parser = argparse.ArgumentParser( + description='Generate benchmark configurations from YAML config files' + ) + + # Create subparsers for subcommands + subparsers = parser.add_subparsers( + dest='command', + required=True, + help='Available commands' + ) + + # Subcommand: full-sweep + full_sweep_parser = subparsers.add_parser( + 'full-sweep', + parents=[parent_parser], + add_help=False, + help='Generate full sweep configurations with optional filtering by model, precision, framework, runner type, and sequence lengths' + ) + full_sweep_parser.add_argument( + '--model-prefix', + nargs='+', + required=False, + help='Model prefix(es) to filter configurations (optional, can specify multiple)' + ) + full_sweep_parser.add_argument( + '--precision', + nargs='+', + required=False, + help='Precision(s) to filter by (e.g., fp4, fp8) (optional, can specify multiple)' + ) + full_sweep_parser.add_argument( + '--framework', + nargs='+', + required=False, + help='Framework(s) to filter by (e.g., vllm, trt, sglang) (optional, can specify multiple)' + ) + full_sweep_parser.add_argument( + '--runner-type', + nargs='+', + required=False, + help='Runner type(s) to filter by (e.g., h200, h100) (optional, can specify multiple)' + ) + full_sweep_parser.add_argument( + '--seq-lens', + nargs='+', + choices=list(seq_len_stoi.keys()), + required=False, + help=f"Sequence length configurations to include: {', '.join(seq_len_stoi.keys())}. If not specified, all sequence lengths are included." 
+
+
+def main():
+    # Create parent parser with common arguments
+    parent_parser = argparse.ArgumentParser(add_help=False)
+    parent_parser.add_argument(
+        '--config-files',
+        nargs='+',
+        required=True,
+        help='One or more configuration files (YAML format)'
+    )
+    parent_parser.add_argument(
+        '--runner-config',
+        required=True,
+        help='Configuration file holding runner information (YAML format)'
+    )
+    parent_parser.add_argument(
+        '--run-evals',
+        action='store_true',
+        required=False,
+        help='When specified, run evals on a subset of configs.'
+    )
+
+    # Create main parser
+    parser = argparse.ArgumentParser(
+        description='Generate benchmark configurations from YAML config files'
+    )
+
+    # Create subparsers for subcommands
+    subparsers = parser.add_subparsers(
+        dest='command',
+        required=True,
+        help='Available commands'
+    )
+
+    # Subcommand: full-sweep
+    full_sweep_parser = subparsers.add_parser(
+        'full-sweep',
+        parents=[parent_parser],
+        add_help=False,
+        help='Generate full sweep configurations with optional filtering by model, precision, framework, runner type, and sequence lengths'
+    )
+    full_sweep_parser.add_argument(
+        '--model-prefix',
+        nargs='+',
+        required=False,
+        help='Model prefix(es) to filter configurations (optional, can specify multiple)'
+    )
+    full_sweep_parser.add_argument(
+        '--precision',
+        nargs='+',
+        required=False,
+        help='Precision(s) to filter by (e.g., fp4, fp8) (optional, can specify multiple)'
+    )
+    full_sweep_parser.add_argument(
+        '--framework',
+        nargs='+',
+        required=False,
+        help='Framework(s) to filter by (e.g., vllm, trt, sglang) (optional, can specify multiple)'
+    )
+    full_sweep_parser.add_argument(
+        '--runner-type',
+        nargs='+',
+        required=False,
+        help='Runner type(s) to filter by (e.g., h200, h100) (optional, can specify multiple)'
+    )
+    full_sweep_parser.add_argument(
+        '--seq-lens',
+        nargs='+',
+        choices=list(seq_len_stoi.keys()),
+        required=False,
+        help=f"Sequence length configurations to include: {', '.join(seq_len_stoi.keys())}. If not specified, all sequence lengths are included."
+    )
+    full_sweep_parser.add_argument(
+        '--step-size',
+        type=int,
+        default=2,
+        help='Step size for concurrency values (default: 2)'
+    )
+    full_sweep_parser.add_argument(
+        '--max-conc',
+        type=int,
+        required=False,
+        help='Maximum concurrency value to include (filters out higher concurrency values)'
+    )
+    full_sweep_parser.add_argument(
+        '--max-tp',
+        type=int,
+        required=False,
+        help='Maximum tensor parallelism value to include (single-node only)'
+    )
+    full_sweep_parser.add_argument(
+        '--max-ep',
+        type=int,
+        required=False,
+        help='Maximum expert parallelism value to include (single-node only)'
+    )
+    node_type_group = full_sweep_parser.add_mutually_exclusive_group(required=True)
+    node_type_group.add_argument(
+        '--single-node',
+        action='store_true',
+        help='Only generate single-node configurations'
+    )
+    node_type_group.add_argument(
+        '--multi-node',
+        action='store_true',
+        help='Only generate multi-node configurations'
+    )
+    full_sweep_parser.add_argument(
+        '-h', '--help',
+        action='help',
+        help='Show this help message and exit'
+    )
+
+    # Subcommand: runner-model-sweep
+    test_config_parser = subparsers.add_parser(
+        'runner-model-sweep',
+        parents=[parent_parser],
+        add_help=False,
+        help='Given a runner type, find all configurations matching that type and run each of them on every individual runner node of that type. This validates that all runner nodes work with all configurations for the runner type (for instance, that every config targeting an h200 runner runs successfully on every h200 node).'
+    )
+    test_config_parser.add_argument(
+        '--runner-type',
+        required=True,
+        help='Runner type (e.g., b200-trt, h100)'
+    )
+    test_config_parser.add_argument(
+        '--runner-node-filter',
+        required=False,
+        help='Filter runner nodes by substring match (e.g., "mi300x-amd" to only include nodes containing that string)'
+    )
+    test_node_group = test_config_parser.add_mutually_exclusive_group(
+        required=True)
+    test_node_group.add_argument(
+        '--single-node',
+        action='store_true',
+        help='Generate single-node configurations only'
+    )
+    test_node_group.add_argument(
+        '--multi-node',
+        action='store_true',
+        help='Generate multi-node configurations only'
+    )
+    test_config_parser.add_argument(
+        '-h', '--help',
+        action='help',
+        help='Show this help message and exit'
+    )
+
+    # Subcommand: test-config
+    test_config_keys_parser = subparsers.add_parser(
+        'test-config',
+        parents=[parent_parser],
+        add_help=False,
+        help='Generate full sweep for specific config keys. Validates that all specified keys exist before generating.'
+ ) + test_config_keys_parser.add_argument( + '--config-keys', + nargs='+', + required=True, + help='One or more config keys to generate sweep for (e.g., dsr1-fp4-b200-sglang dsr1-fp8-h200-trt)' + ) + test_config_keys_parser.add_argument( + '-h', '--help', + action='help', + help='Show this help message and exit' + ) + + args = parser.parse_args() + + # Load and validate configuration files (validation happens by default in load functions) + all_config_data = load_config_files(args.config_files) + runner_data = load_runner_file(args.runner_config) + + # Route to appropriate function based on subcommand + if args.command == 'full-sweep': + matrix_values = generate_full_sweep(args, all_config_data, runner_data) + elif args.command == 'runner-model-sweep': + matrix_values = generate_runner_model_sweep_config( + args, all_config_data, runner_data) + elif args.command == 'test-config': + matrix_values = generate_test_config_sweep(args, all_config_data) + else: + parser.error(f"Unknown command: {args.command}") + + # Choose eval (opt-in via --run-evals) + if args.run_evals: + matrix_values = mark_eval_entries(matrix_values) + + print(json.dumps(matrix_values)) + return matrix_values + + +if __name__ == "__main__": + main() diff --git a/utils/matrix-logic/pytest.ini b/utils/matrix_logic/pytest.ini similarity index 100% rename from utils/matrix-logic/pytest.ini rename to utils/matrix_logic/pytest.ini diff --git a/utils/matrix_logic/test_generate_sweep_configs.py b/utils/matrix_logic/test_generate_sweep_configs.py new file mode 100644 index 000000000..c505611c3 --- /dev/null +++ b/utils/matrix_logic/test_generate_sweep_configs.py @@ -0,0 +1,948 @@ +"""Comprehensive tests for generate_sweep_configs.py""" +import pytest +import argparse +from generate_sweep_configs import ( + seq_len_stoi, + seq_len_itos, + seq_len_to_str, + generate_full_sweep, + generate_runner_model_sweep_config, +) + + +# ============================================================================= +# Test Fixtures +# ============================================================================= + +@pytest.fixture +def sample_single_node_config(): + """Single node config based on dsr1-fp8-mi300x-sglang.""" + return { + "dsr1-fp8-mi300x-sglang": { + "image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915", + "model": "deepseek-ai/DeepSeek-R1-0528", + "model-prefix": "dsr1", + "precision": "fp8", + "framework": "sglang", + "runner": "mi300x", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 8192, + "osl": 1024, + "search-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + } + ] + } + } + + +@pytest.fixture +def sample_multinode_config(): + """Multinode config based on dsr1-fp4-gb200-dynamo-trt.""" + return { + "dsr1-fp4-gb200-dynamo-trt": { + "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", + "model": "deepseek-r1-fp4", + "model-prefix": "dsr1", + "precision": "fp4", + "framework": "dynamo-trt", + "runner": "gb200", + "multinode": True, + "disagg": True, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + { + "conc-list": [2150], + "prefill": { + "num-worker": 5, + "tp": 4, + "ep": 4, + "dp-attn": True, + "additional-settings": [ + "PREFILL_MAX_NUM_TOKENS=8448", + "PREFILL_MAX_BATCH_SIZE=1", + ], + }, + "decode": { + "num-worker": 1, + "tp": 8, + "ep": 8, + "dp-attn": True, + "additional-settings": [ + "DECODE_MAX_NUM_TOKENS=256", + 
"DECODE_MAX_BATCH_SIZE=256", + ], + }, + } + ] + } + ] + } + } + + +@pytest.fixture +def sample_runner_config(): + """Runner config based on .github/configs/runners.yaml.""" + return { + "h100": ["h100-cr_0", "h100-cr_1", "h100-cw_0", "h100-cw_1"], + "h200": ["h200-cw_0", "h200-cw_1", "h200-nb_0", "h200-nb_1"], + "b200": ["b200-nvd_0", "b200-nvd_1", "b200-dgxc_1"], + "mi300x": ["mi300x-amd_0", "mi300x-amd_1", "mi300x-cr_0"], + "gb200": ["gb200-nv_0"], + } + + +@pytest.fixture +def full_sweep_args_single_node(): + """Args for full-sweep single-node command.""" + args = argparse.Namespace() + args.model_prefix = None + args.precision = None + args.framework = None + args.runner_type = None + args.seq_lens = None + args.step_size = 2 + args.max_conc = None + args.max_tp = None + args.max_ep = None + args.single_node = True + args.multi_node = False + return args + + +@pytest.fixture +def full_sweep_args_multi_node(): + """Args for full-sweep multi-node command.""" + args = argparse.Namespace() + args.model_prefix = None + args.precision = None + args.framework = None + args.runner_type = None + args.seq_lens = None + args.step_size = 2 + args.max_conc = None + args.max_tp = None + args.max_ep = None + args.single_node = False + args.multi_node = True + return args + + +# ============================================================================= +# Test seq_len mappings +# ============================================================================= + +class TestSeqLenMappings: + """Tests for sequence length string mappings.""" + + def test_seq_len_stoi_values(self): + """Verify seq_len_stoi has expected mappings.""" + assert seq_len_stoi["1k1k"] == (1024, 1024) + assert seq_len_stoi["1k8k"] == (1024, 8192) + assert seq_len_stoi["8k1k"] == (8192, 1024) + + def test_seq_len_itos_reverse_mapping(self): + """Verify seq_len_itos is reverse of stoi.""" + assert seq_len_itos[(1024, 1024)] == "1k1k" + assert seq_len_itos[(1024, 8192)] == "1k8k" + assert seq_len_itos[(8192, 1024)] == "8k1k" + + +class TestSeqLenToStr: + """Tests for seq_len_to_str function.""" + + def test_known_sequence_lengths(self): + """Known sequence lengths should return short name.""" + assert seq_len_to_str(1024, 1024) == "1k1k" + assert seq_len_to_str(1024, 8192) == "1k8k" + assert seq_len_to_str(8192, 1024) == "8k1k" + + def test_unknown_sequence_lengths(self): + """Unknown sequence lengths should return isl_osl format.""" + assert seq_len_to_str(2048, 2048) == "2048_2048" + assert seq_len_to_str(4096, 1024) == "4096_1024" + + +# ============================================================================= +# Test generate_full_sweep for single-node +# ============================================================================= + +class TestGenerateFullSweepSingleNode: + """Tests for generate_full_sweep with single-node configs.""" + + def test_basic_sweep_generation(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """Basic single-node sweep should generate entries.""" + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) > 0 + # With step_size=2, conc goes 4, 8, 16, 32, 64 = 5 values per seq-len config + # 2 seq-len configs * 5 = 10 entries + assert len(result) == 10 + + def test_matrix_entry_structure(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """Generated entries should have correct structure.""" + result = generate_full_sweep( + 
full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + entry = result[0] + assert entry["image"] == "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915" + assert entry["model"] == "deepseek-ai/DeepSeek-R1-0528" + assert entry["precision"] == "fp8" + assert entry["framework"] == "sglang" + assert entry["runner"] == "mi300x" + assert entry["tp"] == 8 + assert "exp-name" in entry + assert "max-model-len" in entry + + def test_filter_by_model_prefix(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """Filter by model prefix should work.""" + full_sweep_args_single_node.model_prefix = ["dsr1"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) > 0 + + # Non-matching prefix should return empty + full_sweep_args_single_node.model_prefix = ["nonexistent"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) == 0 + + def test_filter_by_precision(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """Filter by precision should work.""" + full_sweep_args_single_node.precision = ["fp8"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) > 0 + + full_sweep_args_single_node.precision = ["fp4"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) == 0 + + def test_filter_by_framework(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """Filter by framework should work.""" + full_sweep_args_single_node.framework = ["sglang"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) > 0 + + full_sweep_args_single_node.framework = ["vllm"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) == 0 + + def test_filter_by_runner_type(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """Filter by runner type should work.""" + full_sweep_args_single_node.runner_type = ["mi300x"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) > 0 + + full_sweep_args_single_node.runner_type = ["h100"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) == 0 + + def test_invalid_runner_type_raises_error(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """Invalid runner type should raise ValueError.""" + full_sweep_args_single_node.runner_type = ["invalid_runner"] + with pytest.raises(ValueError) as exc_info: + generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert "Invalid runner type" in str(exc_info.value) + + def test_filter_by_seq_lens(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """Filter by sequence lengths should work.""" + full_sweep_args_single_node.seq_lens = ["1k1k"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + # Only 1k1k 
entries, 5 concurrency values + assert len(result) == 5 + assert all(entry["isl"] == 1024 and entry["osl"] == 1024 for entry in result) + + def test_max_conc_filter(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """max_conc filter should limit concurrency values.""" + full_sweep_args_single_node.max_conc = 16 + full_sweep_args_single_node.seq_lens = ["1k1k"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + # conc values: 4, 8, 16 (32, 64 filtered out) + assert len(result) == 3 + assert all(entry["conc"] <= 16 for entry in result) + + def test_max_conc_creates_config_when_below_min(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """max_conc below config's min should create config with max_conc value.""" + # Config has conc-start=4, so max_conc=1 should create entry with conc=1 + full_sweep_args_single_node.max_conc = 1 + full_sweep_args_single_node.seq_lens = ["1k1k"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + # Should create 1 entry with conc=1 + assert len(result) == 1 + assert result[0]["conc"] == 1 + + def test_max_conc_zero_or_negative_skips(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """max_conc of 0 or negative should skip configs.""" + for invalid_value in [0, -1, -100]: + full_sweep_args_single_node.max_conc = invalid_value + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) == 0, f"Expected 0 results for max_conc={invalid_value}" + + def test_max_tp_filter(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """max_tp filter should use max_tp when config tp exceeds it.""" + full_sweep_args_single_node.max_tp = 4 + full_sweep_args_single_node.seq_lens = ["1k1k"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + # tp=8 in config, but max_tp=4, so should use tp=4 + assert len(result) > 0 + assert all(entry["tp"] == 4 for entry in result) + + def test_max_tp_creates_config_when_below_min(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """max_tp below config's tp should create config with max_tp value.""" + # Config has tp=8, so max_tp=2 should create entries with tp=2 + full_sweep_args_single_node.max_tp = 2 + full_sweep_args_single_node.seq_lens = ["1k1k"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) > 0 + assert all(entry["tp"] == 2 for entry in result) + + def test_max_tp_zero_or_negative_skips(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """max_tp of 0 or negative should skip configs.""" + for invalid_value in [0, -1, -100]: + full_sweep_args_single_node.max_tp = invalid_value + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) == 0, f"Expected 0 results for max_tp={invalid_value}" + + def test_step_size(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """Different step sizes should affect concurrency progression.""" + full_sweep_args_single_node.step_size = 4 + full_sweep_args_single_node.seq_lens = ["1k1k"] + result = 
generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + # conc: 4, 16, 64 = 3 values + assert len(result) == 3 + conc_values = [entry["conc"] for entry in result] + assert 4 in conc_values + assert 16 in conc_values + assert 64 in conc_values + + def test_exp_name_format(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """exp-name should have correct format.""" + full_sweep_args_single_node.seq_lens = ["1k1k"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert all(entry["exp-name"] == "dsr1_1k1k" for entry in result) + + def test_max_model_len_calculation(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """max-model-len should be isl + osl + 200.""" + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + for entry in result: + expected_max_model_len = entry["isl"] + entry["osl"] + 200 + assert entry["max-model-len"] == expected_max_model_len + + +# ============================================================================= +# Test generate_full_sweep for multi-node +# ============================================================================= + +class TestGenerateFullSweepMultiNode: + """Tests for generate_full_sweep with multi-node configs.""" + + def test_multinode_sweep_generation(self, sample_multinode_config, sample_runner_config, full_sweep_args_multi_node): + """Multinode sweep should generate entries with prefill/decode.""" + result = generate_full_sweep( + full_sweep_args_multi_node, + sample_multinode_config, + sample_runner_config + ) + assert len(result) == 1 # One entry with conc-list + + def test_multinode_entry_structure(self, sample_multinode_config, sample_runner_config, full_sweep_args_multi_node): + """Multinode entries should have prefill and decode configs.""" + result = generate_full_sweep( + full_sweep_args_multi_node, + sample_multinode_config, + sample_runner_config + ) + entry = result[0] + assert "prefill" in entry + assert "decode" in entry + assert entry["prefill"]["num-worker"] == 5 + assert entry["decode"]["num-worker"] == 1 + assert entry["disagg"] is True + + def test_multinode_conc_as_list(self, sample_multinode_config, sample_runner_config, full_sweep_args_multi_node): + """Multinode conc should be passed as list.""" + result = generate_full_sweep( + full_sweep_args_multi_node, + sample_multinode_config, + sample_runner_config + ) + entry = result[0] + assert isinstance(entry["conc"], list) + assert entry["conc"] == [2150] + + def test_single_node_flag_skips_multinode(self, sample_multinode_config, sample_runner_config, full_sweep_args_single_node): + """Single-node flag should skip multinode configs.""" + result = generate_full_sweep( + full_sweep_args_single_node, + sample_multinode_config, + sample_runner_config + ) + assert len(result) == 0 + + +# ============================================================================= +# Test generate_runner_model_sweep_config +# ============================================================================= + +class TestGenerateRunnerModelSweepConfig: + """Tests for generate_runner_model_sweep_config function.""" + + @pytest.fixture + def runner_sweep_args(self): + """Args for runner-model-sweep command (single-node).""" + args = argparse.Namespace() + args.runner_type = "mi300x" + args.runner_config = "runners.yaml" + args.runner_node_filter 
= None + args.single_node = True + args.multi_node = False + return args + + def test_basic_runner_sweep(self, sample_single_node_config, sample_runner_config, runner_sweep_args): + """Basic runner sweep should generate entries for each node.""" + result = generate_runner_model_sweep_config( + runner_sweep_args, + sample_single_node_config, + sample_runner_config + ) + # 3 mi300x nodes + assert len(result) == 3 + + def test_runner_sweep_entry_structure(self, sample_single_node_config, sample_runner_config, runner_sweep_args): + """Runner sweep entries should use 1k1k config.""" + result = generate_runner_model_sweep_config( + runner_sweep_args, + sample_single_node_config, + sample_runner_config + ) + for entry in result: + assert entry["isl"] == 1024 + assert entry["osl"] == 1024 + assert entry["max-model-len"] == 2048 + assert "_test" in entry["exp-name"] + + def test_each_node_gets_entry(self, sample_single_node_config, sample_runner_config, runner_sweep_args): + """Each runner node should get its own entry.""" + result = generate_runner_model_sweep_config( + runner_sweep_args, + sample_single_node_config, + sample_runner_config + ) + runners = [entry["runner"] for entry in result] + assert "mi300x-amd_0" in runners + assert "mi300x-amd_1" in runners + assert "mi300x-cr_0" in runners + + def test_invalid_runner_type(self, sample_single_node_config, sample_runner_config, runner_sweep_args): + """Invalid runner type should raise error.""" + runner_sweep_args.runner_type = "nonexistent" + with pytest.raises(ValueError) as exc_info: + generate_runner_model_sweep_config( + runner_sweep_args, + sample_single_node_config, + sample_runner_config + ) + assert "does not exist" in str(exc_info.value) + + def test_runner_node_filter(self, sample_single_node_config, sample_runner_config, runner_sweep_args): + """Runner node filter should limit nodes.""" + runner_sweep_args.runner_node_filter = "amd" + result = generate_runner_model_sweep_config( + runner_sweep_args, + sample_single_node_config, + sample_runner_config + ) + # Only mi300x-amd_0 and mi300x-amd_1 match + assert len(result) == 2 + assert all("amd" in entry["runner"] for entry in result) + + def test_runner_node_filter_no_match(self, sample_single_node_config, sample_runner_config, runner_sweep_args): + """Runner node filter with no matches should raise error.""" + runner_sweep_args.runner_node_filter = "nonexistent" + with pytest.raises(ValueError) as exc_info: + generate_runner_model_sweep_config( + runner_sweep_args, + sample_single_node_config, + sample_runner_config + ) + assert "No runner nodes found" in str(exc_info.value) + + def test_uses_highest_tp(self, sample_single_node_config, sample_runner_config, runner_sweep_args): + """Should use highest TP from search space.""" + result = generate_runner_model_sweep_config( + runner_sweep_args, + sample_single_node_config, + sample_runner_config + ) + # Config has tp=8 + assert all(entry["tp"] == 8 for entry in result) + + def test_uses_lowest_conc(self, sample_single_node_config, sample_runner_config, runner_sweep_args): + """Should use lowest concurrency from search space.""" + result = generate_runner_model_sweep_config( + runner_sweep_args, + sample_single_node_config, + sample_runner_config + ) + # Config has conc-start=4 + assert all(entry["conc"] == 4 for entry in result) + + +# ============================================================================= +# Test edge cases and special configurations +# ============================================================================= 
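The edge cases below mostly exercise the clamped geometric concurrency expansion that generate_full_sweep applies to conc-start/conc-end ranges. A standalone sketch of that expansion, where expand_conc is a hypothetical helper and not part of the module:

def expand_conc(start: int, end: int, step: int = 2) -> list[int]:
    """Multiply start by step until end, clamping the final value to end."""
    values, conc = [], start
    while conc <= end:
        values.append(conc)
        if conc == end:
            break
        conc *= step
        if conc > end:
            conc = end  # clamp the overshoot so end is always included
    return values

assert expand_conc(4, 64) == [4, 8, 16, 32, 64]
assert expand_conc(1, 8) == [1, 2, 4, 8]
assert expand_conc(1, 6, step=4) == [1, 4, 6]  # 16 would overshoot, so clamp to 6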
+ +class TestEdgeCases: + """Tests for edge cases and special configurations.""" + + def test_config_with_ep_and_dp_attn(self, sample_runner_config, full_sweep_args_single_node): + """Config with ep and dp-attn should be handled correctly.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "model-prefix": "test", + "precision": "fp4", + "framework": "sglang", + "runner": "b200", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 4, "ep": 4, "dp-attn": True, "conc-start": 4, "conc-end": 4} + ] + } + ] + } + } + result = generate_full_sweep( + full_sweep_args_single_node, + config, + sample_runner_config + ) + assert len(result) == 1 + assert result[0]["ep"] == 4 + assert result[0]["dp-attn"] is True + + def test_config_with_spec_decoding(self, sample_runner_config, full_sweep_args_single_node): + """Config with spec-decoding should be handled correctly.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "model-prefix": "test", + "precision": "fp4", + "framework": "trt", + "runner": "b200", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 8, "spec-decoding": "mtp", "conc-start": 4, "conc-end": 4} + ] + } + ] + } + } + result = generate_full_sweep( + full_sweep_args_single_node, + config, + sample_runner_config + ) + assert len(result) == 1 + assert result[0]["spec-decoding"] == "mtp" + + def test_conc_list_in_single_node(self, sample_runner_config, full_sweep_args_single_node): + """Single node config with conc-list should work.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "model-prefix": "test", + "precision": "fp8", + "framework": "sglang", + "runner": "mi300x", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 16} + ] + } + ] + } + } + result = generate_full_sweep( + full_sweep_args_single_node, + config, + sample_runner_config + ) + conc_values = [entry["conc"] for entry in result] + assert 4 in conc_values + assert 8 in conc_values + assert 16 in conc_values + + def test_disagg_defaults_to_false(self, sample_runner_config, full_sweep_args_single_node): + """disagg should default to False when not specified.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "model-prefix": "test", + "precision": "fp8", + "framework": "sglang", + "runner": "mi300x", + "multinode": False, + # No disagg field + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 4} + ] + } + ] + } + } + result = generate_full_sweep( + full_sweep_args_single_node, + config, + sample_runner_config + ) + assert result[0]["disagg"] is False + + def test_multinode_conc_range_expansion(self, sample_runner_config, full_sweep_args_multi_node): + """Multinode with conc range should expand to list.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "model-prefix": "test", + "precision": "fp4", + "framework": "dynamo-trt", + "runner": "gb200", + "multinode": True, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + { + "conc-start": 1, + "conc-end": 8, + "prefill": { + "num-worker": 1, + "tp": 4, + "ep": 4, + "dp-attn": False, + }, + "decode": { + "num-worker": 1, + "tp": 8, + "ep": 8, + "dp-attn": False, + }, + } + ] + } + ] + } + } + result = 
generate_full_sweep( + full_sweep_args_multi_node, + config, + sample_runner_config + ) + assert len(result) == 1 + # step_size=2: 1, 2, 4, 8 + assert result[0]["conc"] == [1, 2, 4, 8] + + def test_max_ep_creates_config_when_below_min(self, sample_runner_config, full_sweep_args_single_node): + """max_ep below config's ep should create config with max_ep value.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "model-prefix": "test", + "precision": "fp4", + "framework": "sglang", + "runner": "b200", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 4} + ] + } + ] + } + } + full_sweep_args_single_node.max_ep = 2 + result = generate_full_sweep( + full_sweep_args_single_node, + config, + sample_runner_config + ) + # ep=8 in config, but max_ep=2, so should use ep=2 + assert len(result) == 1 + assert result[0]["ep"] == 2 + + def test_max_ep_zero_or_negative_skips(self, sample_runner_config, full_sweep_args_single_node): + """max_ep of 0 or negative should skip configs.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "model-prefix": "test", + "precision": "fp4", + "framework": "sglang", + "runner": "b200", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 4} + ] + } + ] + } + } + for invalid_value in [0, -1, -100]: + full_sweep_args_single_node.max_ep = invalid_value + result = generate_full_sweep( + full_sweep_args_single_node, + config, + sample_runner_config + ) + assert len(result) == 0, f"Expected 0 results for max_ep={invalid_value}" + + def test_multinode_max_conc_zero_or_negative_skips(self, sample_runner_config, full_sweep_args_multi_node): + """Multinode max_conc of 0 or negative should skip configs.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "model-prefix": "test", + "precision": "fp4", + "framework": "dynamo-trt", + "runner": "gb200", + "multinode": True, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + { + "conc-list": [100, 200, 400], + "prefill": { + "num-worker": 1, + "tp": 4, + "ep": 4, + "dp-attn": False, + }, + "decode": { + "num-worker": 1, + "tp": 8, + "ep": 8, + "dp-attn": False, + }, + } + ] + } + ] + } + } + for invalid_value in [0, -1, -100]: + full_sweep_args_multi_node.max_conc = invalid_value + result = generate_full_sweep( + full_sweep_args_multi_node, + config, + sample_runner_config + ) + assert len(result) == 0, f"Expected 0 results for max_conc={invalid_value}" + + def test_multinode_max_conc_creates_config_when_below_min(self, sample_runner_config, full_sweep_args_multi_node): + """Multinode max_conc below all values should create config with max_conc.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "model-prefix": "test", + "precision": "fp4", + "framework": "dynamo-trt", + "runner": "gb200", + "multinode": True, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + { + "conc-list": [100, 200, 400], + "prefill": { + "num-worker": 1, + "tp": 4, + "ep": 4, + "dp-attn": False, + }, + "decode": { + "num-worker": 1, + "tp": 8, + "ep": 8, + "dp-attn": False, + }, + } + ] + } + ] + } + } + full_sweep_args_multi_node.max_conc = 1 + result = generate_full_sweep( + full_sweep_args_multi_node, + config, + sample_runner_config + ) + # All conc values (100, 200, 400) > 
max_conc (1), so should use [1] + assert len(result) == 1 + assert result[0]["conc"] == [1] + + def test_combined_max_filters(self, sample_runner_config, full_sweep_args_single_node): + """Multiple max filters should all apply and create configs with max values.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "model-prefix": "test", + "precision": "fp4", + "framework": "sglang", + "runner": "b200", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 8, "ep": 8, "conc-start": 100, "conc-end": 200} + ] + } + ] + } + } + full_sweep_args_single_node.max_tp = 2 + full_sweep_args_single_node.max_ep = 1 + full_sweep_args_single_node.max_conc = 1 + result = generate_full_sweep( + full_sweep_args_single_node, + config, + sample_runner_config + ) + # All values exceed max, so should use max values + assert len(result) == 1 + assert result[0]["tp"] == 2 + assert result[0]["ep"] == 1 + assert result[0]["conc"] == 1 diff --git a/utils/matrix_logic/test_validation.py b/utils/matrix_logic/test_validation.py new file mode 100644 index 000000000..d9cc7f0d9 --- /dev/null +++ b/utils/matrix_logic/test_validation.py @@ -0,0 +1,869 @@ +"""Comprehensive tests for validation.py""" +import pytest +from validation import ( + Fields, + SingleNodeMatrixEntry, + MultiNodeMatrixEntry, + WorkerConfig, + SingleNodeSearchSpaceEntry, + MultiNodeSearchSpaceEntry, + SingleNodeSeqLenConfig, + MultiNodeSeqLenConfig, + SingleNodeMasterConfigEntry, + MultiNodeMasterConfigEntry, + validate_matrix_entry, + validate_master_config, + validate_runner_config, + load_config_files, + load_runner_file, +) + + +# ============================================================================= +# Test Fixtures +# ============================================================================= + +@pytest.fixture +def valid_single_node_matrix_entry(): + """Valid single node matrix entry based on dsr1-fp4-mi355x-sglang config.""" + return { + "image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915", + "model": "amd/DeepSeek-R1-0528-MXFP4-Preview", + "model-prefix": "dsr1", + "precision": "fp4", + "framework": "sglang", + "spec-decoding": "none", + "runner": "mi355x", + "isl": 1024, + "osl": 1024, + "tp": 8, + "ep": 1, + "dp-attn": False, + "conc": 4, + "max-model-len": 2248, + "exp-name": "dsr1_1k1k", + "disagg": False, + } + + +@pytest.fixture +def valid_multinode_matrix_entry(): + """Valid multinode matrix entry based on dsr1-fp4-gb200-dynamo-trt config.""" + return { + "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", + "model": "deepseek-r1-fp4", + "model-prefix": "dsr1", + "precision": "fp4", + "framework": "dynamo-trt", + "spec-decoding": "none", + "runner": "gb200", + "isl": 1024, + "osl": 1024, + "prefill": { + "num-worker": 5, + "tp": 4, + "ep": 4, + "dp-attn": True, + "additional-settings": [ + "PREFILL_MAX_NUM_TOKENS=8448", + "PREFILL_MAX_BATCH_SIZE=1", + ], + }, + "decode": { + "num-worker": 1, + "tp": 8, + "ep": 8, + "dp-attn": True, + "additional-settings": [ + "DECODE_MAX_NUM_TOKENS=256", + "DECODE_MAX_BATCH_SIZE=256", + "DECODE_GPU_MEM_FRACTION=0.8", + "DECODE_MTP_SIZE=0", + ], + }, + "conc": [2150], + "max-model-len": 2248, + "exp-name": "dsr1_1k1k", + "disagg": True, + } + + +@pytest.fixture +def valid_single_node_master_config(): + """Valid single node master config based on dsr1-fp8-mi300x-sglang.""" + return { + "image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915", 
+ "model": "deepseek-ai/DeepSeek-R1-0528", + "model-prefix": "dsr1", + "precision": "fp8", + "framework": "sglang", + "runner": "mi300x", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + } + ] + } + + +@pytest.fixture +def valid_multinode_master_config(): + """Valid multinode master config based on dsr1-fp4-gb200-dynamo-trt.""" + return { + "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", + "model": "deepseek-r1-fp4", + "model-prefix": "dsr1", + "precision": "fp4", + "framework": "dynamo-trt", + "runner": "gb200", + "multinode": True, + "disagg": True, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + { + "prefill": { + "num-worker": 5, + "tp": 4, + "ep": 4, + "dp-attn": True, + "additional-settings": [ + "PREFILL_MAX_NUM_TOKENS=8448", + "PREFILL_MAX_BATCH_SIZE=1", + ], + }, + "decode": { + "num-worker": 1, + "tp": 8, + "ep": 8, + "dp-attn": True, + "additional-settings": [ + "DECODE_MAX_NUM_TOKENS=256", + "DECODE_MAX_BATCH_SIZE=256", + ], + }, + "conc-list": [2150], + } + ] + } + ] + } + + +@pytest.fixture +def valid_runner_config(): + """Valid runner config based on .github/configs/runners.yaml.""" + return { + "h100": ["h100-cr_0", "h100-cr_1", "h100-cw_0", "h100-cw_1"], + "h200": ["h200-cw_0", "h200-cw_1", "h200-nb_0", "h200-nb_1"], + "b200": ["b200-nvd_0", "b200-nvd_1", "b200-dgxc_1"], + "mi300x": ["mi300x-amd_0", "mi300x-amd_1", "mi300x-cr_0"], + "gb200": ["gb200-nv_0"], + } + + +# ============================================================================= +# Test Fields Enum +# ============================================================================= + +class TestFieldsEnum: + """Tests for Fields enum.""" + + def test_field_values_are_strings(self): + """All field values should be strings.""" + for field in Fields: + assert isinstance(field.value, str) + + def test_key_fields_exist(self): + """Key fields should be defined.""" + assert Fields.IMAGE.value == "image" + assert Fields.MODEL.value == "model" + assert Fields.TP.value == "tp" + assert Fields.MULTINODE.value == "multinode" + assert Fields.CONC.value == "conc" + assert Fields.SPEC_DECODING.value == "spec-decoding" + assert Fields.PREFILL.value == "prefill" + assert Fields.DECODE.value == "decode" + + +# ============================================================================= +# Test WorkerConfig +# ============================================================================= + +class TestWorkerConfig: + """Tests for WorkerConfig model.""" + + def test_valid_worker_config(self): + """Valid worker config should pass.""" + config = WorkerConfig(**{ + "num-worker": 5, + "tp": 4, + "ep": 4, + "dp-attn": True, + }) + assert config.num_worker == 5 + assert config.tp == 4 + assert config.ep == 4 + assert config.dp_attn is True + + def test_worker_config_with_additional_settings(self): + """Worker config with additional settings should pass.""" + config = WorkerConfig(**{ + "num-worker": 1, + "tp": 8, + "ep": 8, + "dp-attn": True, + "additional-settings": [ + "DECODE_MAX_NUM_TOKENS=256", + "DECODE_MAX_BATCH_SIZE=256", + "DECODE_GPU_MEM_FRACTION=0.8", + ], + }) + assert len(config.additional_settings) == 3 + assert "DECODE_MAX_NUM_TOKENS=256" in config.additional_settings + + def test_worker_config_missing_required_field(self): + """Missing required field should fail.""" + with pytest.raises(Exception): + WorkerConfig(**{ + "num-worker": 2, + "tp": 4, + # Missing ep and 
dp-attn + }) + + def test_worker_config_extra_field_forbidden(self): + """Extra fields should be forbidden.""" + with pytest.raises(Exception): + WorkerConfig(**{ + "num-worker": 2, + "tp": 4, + "ep": 1, + "dp-attn": False, + "unknown-field": "value", + }) + + +# ============================================================================= +# Test SingleNodeMatrixEntry +# ============================================================================= + +class TestSingleNodeMatrixEntry: + """Tests for SingleNodeMatrixEntry model.""" + + def test_valid_entry(self, valid_single_node_matrix_entry): + """Valid entry should pass validation.""" + entry = SingleNodeMatrixEntry(**valid_single_node_matrix_entry) + assert entry.image == "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915" + assert entry.tp == 8 + assert entry.conc == 4 + assert entry.framework == "sglang" + + def test_conc_as_list(self, valid_single_node_matrix_entry): + """Conc can be a list of integers.""" + valid_single_node_matrix_entry["conc"] = [4, 8, 16, 32, 64] + entry = SingleNodeMatrixEntry(**valid_single_node_matrix_entry) + assert entry.conc == [4, 8, 16, 32, 64] + + def test_spec_decoding_values(self, valid_single_node_matrix_entry): + """Spec decoding should accept valid literal values.""" + for value in ["mtp", "draft_model", "none"]: + valid_single_node_matrix_entry["spec-decoding"] = value + entry = SingleNodeMatrixEntry(**valid_single_node_matrix_entry) + assert entry.spec_decoding == value + + def test_invalid_spec_decoding(self, valid_single_node_matrix_entry): + """Invalid spec decoding value should fail.""" + valid_single_node_matrix_entry["spec-decoding"] = "invalid" + with pytest.raises(Exception): + SingleNodeMatrixEntry(**valid_single_node_matrix_entry) + + def test_missing_required_field(self, valid_single_node_matrix_entry): + """Missing required field should fail validation.""" + del valid_single_node_matrix_entry["model"] + with pytest.raises(Exception): + SingleNodeMatrixEntry(**valid_single_node_matrix_entry) + + def test_extra_field_forbidden(self, valid_single_node_matrix_entry): + """Extra fields should be forbidden.""" + valid_single_node_matrix_entry["extra-field"] = "value" + with pytest.raises(Exception): + SingleNodeMatrixEntry(**valid_single_node_matrix_entry) + + +# ============================================================================= +# Test MultiNodeMatrixEntry +# ============================================================================= + +class TestMultiNodeMatrixEntry: + """Tests for MultiNodeMatrixEntry model.""" + + def test_valid_entry(self, valid_multinode_matrix_entry): + """Valid entry should pass validation.""" + entry = MultiNodeMatrixEntry(**valid_multinode_matrix_entry) + assert entry.model == "deepseek-r1-fp4" + assert entry.conc == [2150] + assert entry.disagg is True + + def test_prefill_decode_worker_configs(self, valid_multinode_matrix_entry): + """Prefill and decode should be WorkerConfig objects.""" + entry = MultiNodeMatrixEntry(**valid_multinode_matrix_entry) + assert entry.prefill.num_worker == 5 + assert entry.prefill.tp == 4 + assert entry.decode.tp == 8 + assert entry.decode.dp_attn is True + + def test_conc_must_be_list(self, valid_multinode_matrix_entry): + """Conc must be a list for multinode.""" + valid_multinode_matrix_entry["conc"] = 2150 # Single int, not list + with pytest.raises(Exception): + MultiNodeMatrixEntry(**valid_multinode_matrix_entry) + + def test_missing_prefill(self, valid_multinode_matrix_entry): + """Missing prefill 
should fail.""" + del valid_multinode_matrix_entry["prefill"] + with pytest.raises(Exception): + MultiNodeMatrixEntry(**valid_multinode_matrix_entry) + + def test_missing_decode(self, valid_multinode_matrix_entry): + """Missing decode should fail.""" + del valid_multinode_matrix_entry["decode"] + with pytest.raises(Exception): + MultiNodeMatrixEntry(**valid_multinode_matrix_entry) + + +# ============================================================================= +# Test validate_matrix_entry function +# ============================================================================= + +class TestValidateMatrixEntry: + """Tests for validate_matrix_entry function.""" + + def test_valid_single_node(self, valid_single_node_matrix_entry): + """Valid single node entry should return the entry.""" + result = validate_matrix_entry(valid_single_node_matrix_entry, is_multinode=False) + assert result == valid_single_node_matrix_entry + + def test_valid_multinode(self, valid_multinode_matrix_entry): + """Valid multinode entry should return the entry.""" + result = validate_matrix_entry(valid_multinode_matrix_entry, is_multinode=True) + assert result == valid_multinode_matrix_entry + + def test_invalid_single_node_raises_valueerror(self, valid_single_node_matrix_entry): + """Invalid single node entry should raise ValueError.""" + del valid_single_node_matrix_entry["tp"] + with pytest.raises(ValueError) as exc_info: + validate_matrix_entry(valid_single_node_matrix_entry, is_multinode=False) + assert "failed validation" in str(exc_info.value) + + def test_invalid_multinode_raises_valueerror(self, valid_multinode_matrix_entry): + """Invalid multinode entry should raise ValueError.""" + del valid_multinode_matrix_entry["prefill"] + with pytest.raises(ValueError) as exc_info: + validate_matrix_entry(valid_multinode_matrix_entry, is_multinode=True) + assert "failed validation" in str(exc_info.value) + + +# ============================================================================= +# Test SingleNodeSearchSpaceEntry +# ============================================================================= + +class TestSingleNodeSearchSpaceEntry: + """Tests for SingleNodeSearchSpaceEntry model.""" + + def test_valid_with_conc_range(self): + """Valid entry with conc range should pass (like mi300x config).""" + entry = SingleNodeSearchSpaceEntry(**{ + "tp": 8, + "conc-start": 4, + "conc-end": 64, + }) + assert entry.tp == 8 + assert entry.conc_start == 4 + assert entry.conc_end == 64 + + def test_valid_with_conc_list(self): + """Valid entry with conc list should pass.""" + entry = SingleNodeSearchSpaceEntry(**{ + "tp": 4, + "conc-list": [4, 8, 16, 32, 64, 128], + }) + assert entry.conc_list == [4, 8, 16, 32, 64, 128] + + def test_cannot_have_both_range_and_list(self): + """Cannot specify both conc range and list.""" + with pytest.raises(Exception) as exc_info: + SingleNodeSearchSpaceEntry(**{ + "tp": 4, + "conc-start": 4, + "conc-end": 64, + "conc-list": [4, 8, 16], + }) + assert "Cannot specify both" in str(exc_info.value) + + def test_must_have_range_or_list(self): + """Must specify either conc range or list.""" + with pytest.raises(Exception) as exc_info: + SingleNodeSearchSpaceEntry(**{ + "tp": 8, + }) + assert "Must specify either" in str(exc_info.value) + + def test_conc_start_must_be_lte_conc_end(self): + """conc-start must be <= conc-end.""" + with pytest.raises(Exception) as exc_info: + SingleNodeSearchSpaceEntry(**{ + "tp": 8, + "conc-start": 64, + "conc-end": 4, + }) + assert "must be <=" in str(exc_info.value) 
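+
+    # A hedged sketch of what a validated conc range is expected to expand to
+    # downstream, mirroring the step_size=2 doubling asserted in
+    # test_multinode_conc_range_expansion (illustrative helper only):
+    @staticmethod
+    def _expand_conc_range_sketch(start, end):
+        vals, c = [], start
+        while c < end:
+            vals.append(c)
+            c *= 2
+        vals.append(end)  # e.g. (1, 8) -> [1, 2, 4, 8]; (4, 4) -> [4]
+        return vals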
+ + def test_conc_list_values_must_be_positive(self): + """conc-list values must be > 0.""" + with pytest.raises(Exception) as exc_info: + SingleNodeSearchSpaceEntry(**{ + "tp": 4, + "conc-list": [4, 0, 16], + }) + assert "must be greater than 0" in str(exc_info.value) + + def test_optional_fields_defaults(self): + """Optional fields should have correct defaults.""" + entry = SingleNodeSearchSpaceEntry(**{ + "tp": 8, + "conc-list": [4, 8], + }) + assert entry.ep is None + assert entry.dp_attn is None + assert entry.spec_decoding == "none" + + def test_with_ep_and_dp_attn(self): + """Entry with ep and dp-attn like b200-sglang config.""" + entry = SingleNodeSearchSpaceEntry(**{ + "tp": 4, + "ep": 4, + "dp-attn": True, + "conc-start": 4, + "conc-end": 128, + }) + assert entry.ep == 4 + assert entry.dp_attn is True + + def test_with_spec_decoding_mtp(self): + """Entry with mtp spec decoding.""" + entry = SingleNodeSearchSpaceEntry(**{ + "tp": 8, + "spec-decoding": "mtp", + "conc-list": [1, 2, 4], + }) + assert entry.spec_decoding == "mtp" + + +# ============================================================================= +# Test MultiNodeSearchSpaceEntry +# ============================================================================= + +class TestMultiNodeSearchSpaceEntry: + """Tests for MultiNodeSearchSpaceEntry model.""" + + def test_valid_with_conc_list(self): + """Valid multinode search space with list (like gb200 config).""" + entry = MultiNodeSearchSpaceEntry(**{ + "prefill": { + "num-worker": 5, + "tp": 4, + "ep": 4, + "dp-attn": True, + "additional-settings": ["PREFILL_MAX_NUM_TOKENS=8448"], + }, + "decode": { + "num-worker": 1, + "tp": 8, + "ep": 8, + "dp-attn": True, + "additional-settings": ["DECODE_MAX_NUM_TOKENS=256"], + }, + "conc-list": [2150], + }) + assert entry.prefill.num_worker == 5 + assert entry.decode.tp == 8 + + def test_valid_with_conc_range(self): + """Valid multinode search space with range.""" + entry = MultiNodeSearchSpaceEntry(**{ + "prefill": { + "num-worker": 1, + "tp": 4, + "ep": 4, + "dp-attn": False, + }, + "decode": { + "num-worker": 4, + "tp": 8, + "ep": 8, + "dp-attn": False, + }, + "conc-start": 1, + "conc-end": 64, + }) + assert entry.conc_start == 1 + assert entry.conc_end == 64 + + def test_with_spec_decoding_mtp(self): + """Multinode entry with mtp spec decoding.""" + entry = MultiNodeSearchSpaceEntry(**{ + "spec-decoding": "mtp", + "prefill": { + "num-worker": 1, + "tp": 4, + "ep": 4, + "dp-attn": False, + }, + "decode": { + "num-worker": 4, + "tp": 8, + "ep": 8, + "dp-attn": False, + }, + "conc-list": [1, 2, 4, 8, 16, 36], + }) + assert entry.spec_decoding == "mtp" + + def test_missing_conc_specification(self): + """Missing conc specification should fail.""" + with pytest.raises(Exception): + MultiNodeSearchSpaceEntry(**{ + "prefill": { + "num-worker": 2, + "tp": 4, + "ep": 4, + "dp-attn": False, + }, + "decode": { + "num-worker": 2, + "tp": 4, + "ep": 4, + "dp-attn": False, + }, + # Missing conc specification + }) + + +# ============================================================================= +# Test SeqLenConfig models +# ============================================================================= + +class TestSeqLenConfigs: + """Tests for sequence length config models.""" + + def test_single_node_seq_len_config_1k1k(self): + """Valid single node seq len config for 1k/1k.""" + config = SingleNodeSeqLenConfig(**{ + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }) + assert config.isl == 
1024 + assert config.osl == 1024 + assert len(config.search_space) == 1 + + def test_single_node_seq_len_config_8k1k(self): + """Valid single node seq len config for 8k/1k.""" + config = SingleNodeSeqLenConfig(**{ + "isl": 8192, + "osl": 1024, + "search-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }) + assert config.isl == 8192 + assert config.osl == 1024 + + def test_multinode_seq_len_config(self): + """Valid multinode seq len config.""" + config = MultiNodeSeqLenConfig(**{ + "isl": 1024, + "osl": 1024, + "search-space": [ + { + "prefill": { + "num-worker": 5, + "tp": 4, + "ep": 4, + "dp-attn": True, + }, + "decode": { + "num-worker": 1, + "tp": 8, + "ep": 8, + "dp-attn": True, + }, + "conc-list": [2150], + } + ] + }) + assert config.isl == 1024 + assert config.osl == 1024 + + +# ============================================================================= +# Test MasterConfigEntry models +# ============================================================================= + +class TestMasterConfigEntries: + """Tests for master config entry models.""" + + def test_single_node_master_config(self, valid_single_node_master_config): + """Valid single node master config.""" + config = SingleNodeMasterConfigEntry(**valid_single_node_master_config) + assert config.multinode is False + assert config.model_prefix == "dsr1" + assert config.runner == "mi300x" + assert config.framework == "sglang" + + def test_multinode_master_config(self, valid_multinode_master_config): + """Valid multinode master config.""" + config = MultiNodeMasterConfigEntry(**valid_multinode_master_config) + assert config.multinode is True + assert config.model_prefix == "dsr1" + assert config.runner == "gb200" + assert config.disagg is True + + def test_single_node_cannot_have_multinode_true(self, valid_single_node_master_config): + """Single node config must have multinode=False.""" + valid_single_node_master_config["multinode"] = True + with pytest.raises(Exception): + SingleNodeMasterConfigEntry(**valid_single_node_master_config) + + def test_multinode_cannot_have_multinode_false(self, valid_multinode_master_config): + """Multinode config must have multinode=True.""" + valid_multinode_master_config["multinode"] = False + with pytest.raises(Exception): + MultiNodeMasterConfigEntry(**valid_multinode_master_config) + + def test_disagg_default_false(self, valid_single_node_master_config): + """Disagg should default to False.""" + config = SingleNodeMasterConfigEntry(**valid_single_node_master_config) + assert config.disagg is False + + +# ============================================================================= +# Test validate_master_config function +# ============================================================================= + +class TestValidateMasterConfig: + """Tests for validate_master_config function.""" + + def test_valid_single_node_config(self, valid_single_node_master_config): + """Valid single node config should pass.""" + configs = {"dsr1-fp8-mi300x-sglang": valid_single_node_master_config} + result = validate_master_config(configs) + assert result == configs + + def test_valid_multinode_config(self, valid_multinode_master_config): + """Valid multinode config should pass.""" + configs = {"dsr1-fp4-gb200-dynamo-trt": valid_multinode_master_config} + result = validate_master_config(configs) + assert result == configs + + def test_mixed_configs(self, valid_single_node_master_config, valid_multinode_master_config): + """Mixed single and multinode configs should pass.""" + configs = { + 
"dsr1-fp8-mi300x-sglang": valid_single_node_master_config, + "dsr1-fp4-gb200-dynamo-trt": valid_multinode_master_config, + } + result = validate_master_config(configs) + assert len(result) == 2 + + def test_invalid_config_raises_valueerror(self, valid_single_node_master_config): + """Invalid config should raise ValueError with key name.""" + del valid_single_node_master_config["model"] + configs = {"broken-config": valid_single_node_master_config} + with pytest.raises(ValueError) as exc_info: + validate_master_config(configs) + assert "broken-config" in str(exc_info.value) + assert "failed validation" in str(exc_info.value) + + +# ============================================================================= +# Test validate_runner_config function +# ============================================================================= + +class TestValidateRunnerConfig: + """Tests for validate_runner_config function.""" + + def test_valid_runner_config(self, valid_runner_config): + """Valid runner config should pass.""" + result = validate_runner_config(valid_runner_config) + assert result == valid_runner_config + + def test_value_must_be_list(self): + """Runner config values must be lists.""" + config = { + "h100": "h100-cr_0", # Not a list + } + with pytest.raises(ValueError) as exc_info: + validate_runner_config(config) + assert "must be a list" in str(exc_info.value) + + def test_list_must_contain_strings(self): + """Runner config lists must contain only strings.""" + config = { + "h100": ["h100-cr_0", 123], # Contains non-string + } + with pytest.raises(ValueError) as exc_info: + validate_runner_config(config) + assert "must contain only strings" in str(exc_info.value) + + def test_list_cannot_be_empty(self): + """Runner config lists cannot be empty.""" + config = { + "mi355x": [], + } + with pytest.raises(ValueError) as exc_info: + validate_runner_config(config) + assert "cannot be an empty list" in str(exc_info.value) + + def test_multiple_runner_types(self, valid_runner_config): + """Multiple runner types should work.""" + result = validate_runner_config(valid_runner_config) + assert "h100" in result + assert "h200" in result + assert "mi300x" in result + assert "gb200" in result + + +# ============================================================================= +# Test load_config_files +# ============================================================================= + +class TestLoadConfigFiles: + """Tests for load_config_files function.""" + + def test_load_single_file_with_validation(self, tmp_path, valid_single_node_master_config): + """Should load and validate a single config file.""" + config_file = tmp_path / "config.yaml" + import yaml + config_file.write_text(yaml.dump({"test-config": valid_single_node_master_config})) + result = load_config_files([str(config_file)]) + assert "test-config" in result + assert result["test-config"]["image"] == valid_single_node_master_config["image"] + + def test_load_single_file_without_validation(self, tmp_path): + """Should load a single config file without validation when validate=False.""" + config_file = tmp_path / "config.yaml" + config_file.write_text(""" +test-config: + image: test-image + model: test-model +""") + result = load_config_files([str(config_file)], validate=False) + assert "test-config" in result + assert result["test-config"]["image"] == "test-image" + + def test_load_multiple_files(self, tmp_path): + """Should merge multiple config files.""" + config1 = tmp_path / "config1.yaml" + config1.write_text(""" +config-one: + value: 1 
+""") + config2 = tmp_path / "config2.yaml" + config2.write_text(""" +config-two: + value: 2 +""") + result = load_config_files([str(config1), str(config2)], validate=False) + assert "config-one" in result + assert "config-two" in result + + def test_duplicate_keys_raise_error(self, tmp_path): + """Duplicate keys across files should raise error.""" + config1 = tmp_path / "config1.yaml" + config1.write_text(""" +duplicate-key: + value: 1 +""") + config2 = tmp_path / "config2.yaml" + config2.write_text(""" +duplicate-key: + value: 2 +""") + with pytest.raises(ValueError) as exc_info: + load_config_files([str(config1), str(config2)], validate=False) + assert "Duplicate configuration keys" in str(exc_info.value) + + def test_nonexistent_file_raises_error(self): + """Nonexistent file should raise error.""" + with pytest.raises(ValueError) as exc_info: + load_config_files(["nonexistent.yaml"]) + assert "does not exist" in str(exc_info.value) + + def test_validation_runs_by_default(self, tmp_path): + """Validation should run by default and catch invalid configs.""" + config_file = tmp_path / "config.yaml" + config_file.write_text(""" +invalid-config: + image: test-image + # Missing required fields like model, model-prefix, precision, etc. +""") + with pytest.raises(ValueError) as exc_info: + load_config_files([str(config_file)]) + assert "failed validation" in str(exc_info.value) + + +# ============================================================================= +# Test load_runner_file +# ============================================================================= + +class TestLoadRunnerFile: + """Tests for load_runner_file function.""" + + def test_load_runner_file_with_validation(self, tmp_path): + """Should load and validate runner config file.""" + runner_file = tmp_path / "runners.yaml" + runner_file.write_text(""" +h100: +- h100-node-0 +- h100-node-1 +""") + result = load_runner_file(str(runner_file)) + assert "h100" in result + assert len(result["h100"]) == 2 + + def test_load_runner_file_without_validation(self, tmp_path): + """Should load runner config file without validation when validate=False.""" + runner_file = tmp_path / "runners.yaml" + runner_file.write_text(""" +h100: +- h100-node-0 +- h100-node-1 +""") + result = load_runner_file(str(runner_file), validate=False) + assert "h100" in result + assert len(result["h100"]) == 2 + + def test_nonexistent_runner_file(self): + """Nonexistent runner file should raise error.""" + with pytest.raises(ValueError) as exc_info: + load_runner_file("nonexistent.yaml") + assert "does not exist" in str(exc_info.value) + + def test_validation_runs_by_default(self, tmp_path): + """Validation should run by default and catch invalid configs.""" + runner_file = tmp_path / "runners.yaml" + runner_file.write_text(""" +h100: not-a-list +""") + with pytest.raises(ValueError) as exc_info: + load_runner_file(str(runner_file)) + assert "must be a list" in str(exc_info.value) diff --git a/utils/matrix_logic/validation.py b/utils/matrix_logic/validation.py new file mode 100644 index 000000000..4d79d27a9 --- /dev/null +++ b/utils/matrix_logic/validation.py @@ -0,0 +1,438 @@ +from pydantic import BaseModel, Field, ValidationError, ConfigDict, model_validator +from typing import List, Optional, Union, Literal +from enum import Enum + +import pprint +import yaml + +""" + The below class defines the field names expected to be present in the JSON entries + for both single-node and multi-node configurations. 
+""" + + +class Fields(Enum): + # Field name constants + # Top-level config fields + IMAGE = 'image' + MODEL = 'model' + MODEL_PREFIX = 'model-prefix' + PRECISION = 'precision' + FRAMEWORK = 'framework' + RUNNER = 'runner' + SEQ_LEN_CONFIGS = 'seq-len-configs' + MULTINODE = 'multinode' + + # Seq-len-config fields + ISL = 'isl' + OSL = 'osl' + SEARCH_SPACE = 'search-space' + + # Search-space/benchmark fields + TP = 'tp' + CONC_START = 'conc-start' + CONC_END = 'conc-end' + CONC_LIST = 'conc-list' + EP = 'ep' + DP_ATTN = 'dp-attn' + + # Multinode-specific fields (when MULTINODE = true) + SPEC_DECODING = 'spec-decoding' + PREFILL = 'prefill' + DECODE = 'decode' + NUM_WORKER = 'num-worker' + BATCH_SIZE = 'batch-size' + MAX_NUM_TOKENS = 'max-num-tokens' + ADDITIONAL_SETTINGS = 'additional-settings' + + # Matrix entry fields + CONC = 'conc' + MAX_MODEL_LEN = 'max-model-len' + EXP_NAME = 'exp-name' + DISAGG = 'disagg' + + # Eval + FIELD_RUN_EVAL = 'run-eval' + + +""" + Below is the validation logic for the OUTPUT of utils/matrix_logic/generate_sweep_configs.py, i.e., + the input to the actual workflow files. The validation enforces a strict set of rules on the structure + of the generated matrix entries to ensure correctness before proceeding with benchmarking. This ensures + that no validation has to happen in the workflow itself, i.e., at runtime, it is assumed that all inputs + are valid. Threfore, there should not be any default values set in these Pydantic models. Any missing value + should raise a validation error. +""" + + +class SingleNodeMatrixEntry(BaseModel): + """Pydantic model for validating single node matrix entry structure. + This validates the input that should be expected to .github/workflows/benchmark-tmpl.yml""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + image: str + model: str + model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value) + precision: str + framework: str + spec_decoding: Literal["mtp", "draft_model", "none"] = Field( + alias=Fields.SPEC_DECODING.value + ) + runner: str + isl: int + osl: int + tp: int + ep: int + dp_attn: bool = Field(alias=Fields.DP_ATTN.value) + conc: Union[int, List[int]] + max_model_len: int = Field(alias=Fields.MAX_MODEL_LEN.value) + exp_name: str = Field(alias=Fields.EXP_NAME.value) + disagg: bool + run_eval: bool = Field(alias='run-eval', default=False) + + +class WorkerConfig(BaseModel): + """Pydantic model for validating worker configuration in multinode entries.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + num_worker: int = Field(alias=Fields.NUM_WORKER.value) + tp: int + ep: int + dp_attn: bool = Field(alias=Fields.DP_ATTN.value) + additional_settings: Optional[List[str]] = Field( + default=[], alias=Fields.ADDITIONAL_SETTINGS.value) + + +class MultiNodeMatrixEntry(BaseModel): + """Pydantic model for validating multinode matrix entry structure. 
+    This validates the input expected by .github/workflows/benchmark-multinode-tmpl.yml"""
+    model_config = ConfigDict(extra='forbid', populate_by_name=True)
+
+    image: str
+    model: str
+    model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value)
+    precision: str
+    framework: str
+    spec_decoding: Literal["mtp", "draft_model", "none"] = Field(
+        alias=Fields.SPEC_DECODING.value
+    )
+    runner: str
+    isl: int
+    osl: int
+    prefill: WorkerConfig
+    decode: WorkerConfig
+    conc: List[int]
+    max_model_len: int = Field(alias=Fields.MAX_MODEL_LEN.value)
+    exp_name: str = Field(alias=Fields.EXP_NAME.value)
+    disagg: bool
+    run_eval: bool = Field(alias='run-eval', default=False)
+
+
+def validate_matrix_entry(entry: dict, is_multinode: bool) -> dict:
+    """Validate that a matrix entry matches the expected structure.
+
+    Raises ValueError if the entry fails validation.
+    Returns the original entry if it is valid.
+    """
+    try:
+        if is_multinode:
+            MultiNodeMatrixEntry(**entry)
+        else:
+            SingleNodeMatrixEntry(**entry)
+    except ValidationError as e:
+        raise ValueError(
+            f"The following parsed matrix entry failed validation:\n{pprint.pformat(entry)}\n{e}")
+    return entry
+
+
+"""
+    Below is the validation logic for the INPUT to utils/matrix_logic/generate_sweep_configs.py, i.e.,
+    the master configuration files found in .github/configs. The validation enforces a strict set of
+    rules on the structure of the master configuration files to ensure correctness before proceeding
+    with matrix generation.
+"""
+
+
+def _validate_conc_fields(self):
+    """Ensure either (conc_start AND conc_end) OR conc_list is provided, but not both."""
+    has_range = self.conc_start is not None and self.conc_end is not None
+    has_list = self.conc_list is not None and len(self.conc_list) > 0
+
+    if has_range and has_list:
+        raise ValueError(
+            f"Cannot specify both '{Fields.CONC_LIST.value}' list and "
+            f"'{Fields.CONC_START.value}'/'{Fields.CONC_END.value}'. "
+            "Use either a list or a range, not both."
+        )
+
+    if not has_range and not has_list:
+        raise ValueError(
+            f"Must specify either '{Fields.CONC_LIST.value}' list or both "
+            f"'{Fields.CONC_START.value}' and '{Fields.CONC_END.value}'."
+        )
+
+    if has_range:
+        if self.conc_start is None or self.conc_end is None:
+            raise ValueError(
+                f"Both '{Fields.CONC_START.value}' and '{Fields.CONC_END.value}' "
+                "must be provided together."
+            )
+
+        if self.conc_start > self.conc_end:
+            raise ValueError(
+                f"'{Fields.CONC_START.value}' ({self.conc_start}) must be <= "
+                f"'{Fields.CONC_END.value}' ({self.conc_end})."
+            )
+
+    if has_list:
+        if not all(x > 0 for x in self.conc_list):
+            raise ValueError(
+                f"Input '{Fields.CONC_LIST.value}' entries must be greater than 0."
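+                # For example, conc-list=[4, 0, 16] trips this branch (see
+                # test_conc_list_values_must_be_positive), while a range such
+                # as conc-start=4, conc-end=64 is handled by the checks above.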
+            )
+
+    return self
+
+
+class SingleNodeSearchSpaceEntry(BaseModel):
+    """Single node search space configuration."""
+    model_config = ConfigDict(extra='forbid', populate_by_name=True)
+
+    tp: int
+    ep: Optional[int] = None
+    spec_decoding: Literal["mtp", "draft_model", "none"] = Field(
+        default="none", alias=Fields.SPEC_DECODING.value)
+    dp_attn: Optional[bool] = Field(
+        default=None, alias=Fields.DP_ATTN.value)
+    conc_start: Optional[int] = Field(
+        default=None, alias=Fields.CONC_START.value)
+    conc_end: Optional[int] = Field(
+        default=None, alias=Fields.CONC_END.value)
+    conc_list: Optional[List[int]] = Field(
+        default=None, alias=Fields.CONC_LIST.value)
+
+    @model_validator(mode='after')
+    def validate_conc_fields(self):
+        return _validate_conc_fields(self)
+
+
+class MultiNodeSearchSpaceEntry(BaseModel):
+    """Multinode search space configuration."""
+    model_config = ConfigDict(extra='forbid', populate_by_name=True)
+
+    spec_decoding: Literal["mtp", "draft_model", "none"] = Field(
+        default="none", alias=Fields.SPEC_DECODING.value)
+    prefill: WorkerConfig
+    decode: WorkerConfig
+    conc_start: Optional[int] = Field(
+        default=None, alias=Fields.CONC_START.value)
+    conc_end: Optional[int] = Field(
+        default=None, alias=Fields.CONC_END.value)
+    conc_list: Optional[List[int]] = Field(
+        default=None, alias=Fields.CONC_LIST.value)
+
+    @model_validator(mode='after')
+    def validate_conc_fields(self):
+        return _validate_conc_fields(self)
+
+
+class SingleNodeSeqLenConfig(BaseModel):
+    """Single node sequence length configuration."""
+    model_config = ConfigDict(extra='forbid', populate_by_name=True)
+
+    isl: int
+    osl: int
+    search_space: List[SingleNodeSearchSpaceEntry] = Field(
+        alias=Fields.SEARCH_SPACE.value)
+
+
+class MultiNodeSeqLenConfig(BaseModel):
+    """Multinode sequence length configuration."""
+    model_config = ConfigDict(extra='forbid', populate_by_name=True)
+
+    isl: int
+    osl: int
+    search_space: List[MultiNodeSearchSpaceEntry] = Field(
+        alias=Fields.SEARCH_SPACE.value)
+
+
+class SingleNodeMasterConfigEntry(BaseModel):
+    """Top-level single node master configuration entry."""
+    model_config = ConfigDict(extra='forbid', populate_by_name=True)
+
+    image: str
+    model: str
+    model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value)
+    precision: str
+    framework: str
+    runner: str
+    multinode: Literal[False]
+    disagg: bool = Field(default=False)
+    seq_len_configs: List[SingleNodeSeqLenConfig] = Field(
+        alias=Fields.SEQ_LEN_CONFIGS.value)
+
+
+class MultiNodeMasterConfigEntry(BaseModel):
+    """Top-level multinode master configuration entry."""
+    model_config = ConfigDict(extra='forbid', populate_by_name=True)
+
+    image: str
+    model: str
+    model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value)
+    precision: str
+    framework: str
+    runner: str
+    multinode: Literal[True]
+    disagg: bool = Field(default=False)
+    seq_len_configs: List[MultiNodeSeqLenConfig] = Field(
+        alias=Fields.SEQ_LEN_CONFIGS.value)
+
+
+def validate_master_config(master_configs: dict) -> dict:
+    """Validate input master configuration structure."""
+    for key, entry in master_configs.items():
+        is_multinode = entry.get('multinode', False)
+
+        try:
+            if is_multinode:
+                MultiNodeMasterConfigEntry(**entry)
+            else:
+                SingleNodeMasterConfigEntry(**entry)
+        except ValidationError as e:
+            raise ValueError(
+                f"Master config entry '{key}' failed validation:\n{e}")
+    return master_configs
+
+# Runner Config Validation
+
+
+def validate_runner_config(runner_configs: dict) -> dict:
+    """Validate runner 
configuration structure.""" + for key, value in runner_configs.items(): + if not isinstance(value, list): + raise ValueError( + f"Runner config entry '{key}' must be a list, got {type(value).__name__}") + + if not all(isinstance(item, str) for item in value): + raise ValueError( + f"Runner config entry '{key}' must contain only strings") + + if not value: + raise ValueError( + f"Runner config entry '{key}' cannot be an empty list") + + return runner_configs + + +""" + Below is the validation logic for the changelog entries found in perf-changelog.yaml. + This ensures that the changelog entries conform to the expected structure before + proceeding with processing. +""" + + +class ChangelogEntry(BaseModel): + """Pydantic model for validating changelog entry structure.""" + model_config = ConfigDict(extra="forbid", populate_by_name=True) + + config_keys: list[str] = Field(alias="config-keys", min_length=1) + description: str + + +class ChangelogMetadata(BaseModel): + """Pydantic model for validating changelog metadata structure.""" + model_config = ConfigDict(extra="forbid") + + base_ref: str + head_ref: str + entries: list[ChangelogEntry] + + +class ChangelogMatrixEntry(BaseModel): + """Pydantic model for validating final changelog matrix entry structure. + This imposes a strict contract on the output of process_changelog.py, dictated by + the expected input to the run-sweep.yml workflow file. + """ + model_config = ConfigDict(extra="forbid", populate_by_name=True) + + single_node: dict[str, list[SingleNodeMatrixEntry] + ] = Field(default_factory=dict) + multi_node: dict[str, list[MultiNodeMatrixEntry] + ] = Field(default_factory=dict) + changelog_metadata: ChangelogMetadata + + +# ============================================================================= +# File Loading Functions +# ============================================================================= + + +def load_config_files(config_files: List[str], validate: bool = True) -> dict: + """Load and merge configuration files. + + Args: + config_files: List of paths to YAML configuration files. + validate: If True, run validate_master_config on loaded data. Defaults to True. + + Returns: + Merged configuration dictionary. + + Raises: + ValueError: If file doesn't exist, isn't a dict, or has duplicate keys. + """ + all_config_data = {} + for config_file in config_files: + try: + with open(config_file, 'r') as f: + config_data = yaml.safe_load(f) + assert isinstance( + config_data, dict), f"Config file '{config_file}' must contain a dictionary" + + # Don't allow '*' wildcard in master config keys as we need to reserve these + # for expansion in process_changelog.py + for key in config_data.keys(): + if "*" in key: + raise ValueError( + f" Wildcard '*' is not allowed in master config keys: '{key}'") + + # Check for duplicate keys + duplicate_keys = set(all_config_data.keys()) & set( + config_data.keys()) + if duplicate_keys: + raise ValueError( + f"Duplicate configuration keys found in '{config_file}': {', '.join(sorted(duplicate_keys))}" + ) + + all_config_data.update(config_data) + except FileNotFoundError: + raise ValueError(f"Input file '{config_file}' does not exist.") + + if validate: + validate_master_config(all_config_data) + + return all_config_data + + +def load_runner_file(runner_file: str, validate: bool = True) -> dict: + """Load runner configuration file. + + Args: + runner_file: Path to the runner YAML configuration file. + validate: If True, run validate_runner_config on loaded data. Defaults to True. 
+ + Returns: + Runner configuration dictionary. + + Raises: + ValueError: If file doesn't exist or fails validation. + """ + try: + with open(runner_file, 'r') as f: + runner_config = yaml.safe_load(f) + except FileNotFoundError: + raise ValueError( + f"Runner config file '{runner_file}' does not exist.") + + if validate: + validate_runner_config(runner_config) + + return runner_config From 9d4b2179486845c77dd496178812f85be293f817 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 15 Dec 2025 11:55:36 -0600 Subject: [PATCH 177/214] pt 2 manual merge conflict fixes --- .github/workflows/run-sweep.yml | 1 + perf-changelog.yaml | 4 ++++ utils/matrix_logic/generate_sweep_configs.py | 2 +- utils/process_changelog.py | 1 + 4 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/run-sweep.yml b/.github/workflows/run-sweep.yml index e449942d1..6a459de40 100644 --- a/.github/workflows/run-sweep.yml +++ b/.github/workflows/run-sweep.yml @@ -142,6 +142,7 @@ jobs: conc: ${{ matrix.config.conc }} spec-decoding: ${{ matrix.config.spec-decoding }} disagg: ${{ matrix.config.disagg }} + run-eval: ${{ matrix.config.run-eval }} sweep-single-node-1k8k: needs: setup diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 112145f10..6098f931f 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -81,3 +81,7 @@ - Update vLLM image for NVIDIA configs from vLLM 0.11.0 to vLLM 0.11.2 - Adds kv-cache-dtype: fp8 to benchmarks/gptoss_fp4_b200_docker.sh PR: https://github.com/InferenceMAX/InferenceMAX/pull/273 +- config-keys: + - gptoss-fp4-mi300x-vllm + description: | + - Test evals diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py index cba89c448..7ca0f2996 100644 --- a/utils/matrix_logic/generate_sweep_configs.py +++ b/utils/matrix_logic/generate_sweep_configs.py @@ -571,7 +571,7 @@ def main(): '--run-evals', action='store_true', required=False, - help='When specifiedm run evals on a subset of configs.' + help='When specified, run evals on a subset of configs.' 
) # Create main parser diff --git a/utils/process_changelog.py b/utils/process_changelog.py index 4a856c9a8..fc40baaf4 100644 --- a/utils/process_changelog.py +++ b/utils/process_changelog.py @@ -115,6 +115,7 @@ def main(): *MASTER_CONFIGS, "--runner-config", RUNNER_CONFIG, + "--run-evals" ], capture_output=True, text=True, From a9fad5b2014e5c3bb286a01ca7d894c418ecb9d7 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 15 Dec 2025 13:04:55 -0600 Subject: [PATCH 178/214] use double quotes for gha parsing --- .github/workflows/benchmark-tmpl.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 3379e9d15..e10a040bc 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -84,7 +84,7 @@ jobs: benchmark: runs-on: ${{ inputs.runner }} timeout-minutes: 180 - name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.framework }} ${{ inputs.precision }} ${{ inputs.run-eval && ''eval '' || '''' }}tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} conc=${{ inputs.conc }}' + name: "${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.framework }} ${{ inputs.precision }} ${{ inputs.run-eval && 'eval ' || '' }}tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} conc=${{ inputs.conc }}" steps: - name: Resource cleanup run: | From e07eb697aa94d7f1d6d12d787f77d6bf00318867 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 15 Dec 2025 13:05:52 -0600 Subject: [PATCH 179/214] getting rid of full sweep sched changes --- .github/workflows/full-sweep-1k1k-scheduler.yml | 12 ++++++++---- .github/workflows/full-sweep-1k8k-scheduler.yml | 12 ++++++++---- .github/workflows/full-sweep-8k1k-scheduler.yml | 12 ++++++++---- 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/.github/workflows/full-sweep-1k1k-scheduler.yml b/.github/workflows/full-sweep-1k1k-scheduler.yml index 945adeaa3..a8b40214e 100644 --- a/.github/workflows/full-sweep-1k1k-scheduler.yml +++ b/.github/workflows/full-sweep-1k1k-scheduler.yml @@ -16,8 +16,10 @@ jobs: - id: get-dsr1-configs run: | pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1 --run-evals) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT + echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT get-gptoss-configs: runs-on: ubuntu-latest @@ -31,8 +33,10 @@ jobs: - id: get-gptoss-configs run: | pip install pydantic - CONFIG_JSON=$(python3 
${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss --run-evals) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k1k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k1k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT + echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT benchmark-dsr1-multi-node: needs: get-dsr1-configs diff --git a/.github/workflows/full-sweep-1k8k-scheduler.yml b/.github/workflows/full-sweep-1k8k-scheduler.yml index 10b137c88..062f00265 100644 --- a/.github/workflows/full-sweep-1k8k-scheduler.yml +++ b/.github/workflows/full-sweep-1k8k-scheduler.yml @@ -16,8 +16,10 @@ jobs: - id: get-dsr1-configs run: | pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1 --run-evals) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k8k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k8k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT + echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT get-gptoss-configs: runs-on: ubuntu-latest @@ -31,8 +33,10 @@ jobs: - id: get-gptoss-configs run: | pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss --run-evals) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k8k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config 
${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k8k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT + echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT benchmark-dsr1-multi-node: needs: get-dsr1-configs diff --git a/.github/workflows/full-sweep-8k1k-scheduler.yml b/.github/workflows/full-sweep-8k1k-scheduler.yml index d9fd2fd77..2b45b9679 100644 --- a/.github/workflows/full-sweep-8k1k-scheduler.yml +++ b/.github/workflows/full-sweep-8k1k-scheduler.yml @@ -16,8 +16,10 @@ jobs: - id: get-dsr1-configs run: | pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1 --run-evals) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 8k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 8k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT + echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT get-gptoss-configs: runs-on: ubuntu-latest @@ -31,8 +33,10 @@ jobs: - id: get-gptoss-configs run: | pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss --run-evals) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 8k1k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 8k1k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT + echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT benchmark-dsr1-multi-node: needs: get-dsr1-configs From 
9275f0d8aedd1dc1fa4f281c816b1923c5e836b7 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 15 Dec 2025 13:13:54 -0600 Subject: [PATCH 180/214] add back spec decoding and disagg env vars --- .github/workflows/benchmark-tmpl.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index e10a040bc..284443961 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -75,6 +75,8 @@ env: EP_SIZE: ${{ inputs.ep }} DP_ATTENTION: ${{ inputs.dp-attn }} CONC: ${{ inputs.conc }} + SPEC_DECODING: ${{ inputs.spec-decoding }} + DISAGG: ${{ inputs.disagg }} RUN_EVAL: ${{ inputs.run-eval }} permissions: From dba25aa5f0a03b9bcde87a00eb6b5661d7603d54 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 16 Dec 2025 10:48:00 -0600 Subject: [PATCH 181/214] add an option to ONLY run evals --- utils/matrix_logic/generate_sweep_configs.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py index 7ca0f2996..b172c2efd 100644 --- a/utils/matrix_logic/generate_sweep_configs.py +++ b/utils/matrix_logic/generate_sweep_configs.py @@ -567,11 +567,16 @@ def main(): required=True, help='Configuration file holding runner information (YAML format)' ) - parent_parser.add_argument( + eval_group = parent_parser.add_mutually_exclusive_group() + eval_group.add_argument( '--run-evals', action='store_true', - required=False, - help='When specified, run evals on a subset of configs.' + help='When specified, run evals on a subset of configs (in addition to all configs).' + ) + eval_group.add_argument( + '--evals-only', + action='store_true', + help='When specified, run ONLY the eval subset (excludes non-eval configs).' 
) # Create main parser @@ -736,9 +741,12 @@ def main(): else: parser.error(f"Unknown command: {args.command}") - # Choose eval (opt-in via --run-evals) - if args.run_evals: + # Handle eval options (mutually exclusive) + if args.run_evals or args.evals_only: matrix_values = mark_eval_entries(matrix_values) + # IF --evals-only is specified, filter to only eval entries + if args.evals_only: + matrix_values = [e for e in matrix_values if e.get(Fields.FIELD_RUN_EVAL.value, False)] print(json.dumps(matrix_values)) return matrix_values From 5de917bcf1f62814d0cdaebf6c059049d5b3d045 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 16 Dec 2025 11:28:46 -0600 Subject: [PATCH 182/214] remove full-sweep-test workflow and add collect-evals job to run sweep and e2e test --- .github/workflows/collect-evals.yml | 12 +- .github/workflows/e2e-tests.yml | 8 +- .github/workflows/full-sweep-test.yml | 503 -------------------------- .github/workflows/run-sweep.yml | 17 + 4 files changed, 30 insertions(+), 510 deletions(-) delete mode 100644 .github/workflows/full-sweep-test.yml diff --git a/.github/workflows/collect-evals.yml b/.github/workflows/collect-evals.yml index c45842ef2..64bf603e8 100644 --- a/.github/workflows/collect-evals.yml +++ b/.github/workflows/collect-evals.yml @@ -3,7 +3,7 @@ name: Template - Collect Evals on: workflow_call: inputs: - exp-name: + result-prefix: required: false type: string default: '' @@ -25,19 +25,19 @@ jobs: uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0 with: path: eval_results/ - pattern: ${{ inputs.exp-name && format('eval_{0}_*', inputs.exp-name) || 'eval_*' }} + pattern: ${{ inputs.result-prefix && format('eval_{0}_*', inputs.result-prefix) || 'eval_*' }} - name: Summarize evals run: | - echo "## Eval Summary - ${{ inputs.exp-name || 'all' }}" >> $GITHUB_STEP_SUMMARY + echo "## Eval Summary - ${{ inputs.result-prefix || 'all' }}" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY - python3 utils/collect_eval_results.py eval_results/ ${{ inputs.exp-name || 'all' }} >> $GITHUB_STEP_SUMMARY + python3 utils/collect_eval_results.py eval_results/ ${{ inputs.result-prefix || 'all' }} >> $GITHUB_STEP_SUMMARY - name: Upload aggregated evals uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 with: - name: eval_results_${{ inputs.exp-name || 'all' }} - path: agg_eval_${{ inputs.exp-name || 'all' }}.json + name: eval_results_${{ inputs.result-prefix || 'all' }} + path: agg_eval_${{ inputs.result-prefix || 'all' }}.json - name: Cleanup downloaded eval artifacts if: ${{ always() }} diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 208f635d8..11057abcd 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -105,8 +105,14 @@ jobs: uses: ./.github/workflows/collect-results.yml secrets: inherit + collect-evals: + needs: [test-sweep-multi-node, test-sweep-single-node] + if: ${{ always() }} + uses: ./.github/workflows/collect-evals.yml + secrets: inherit + calc-success-rate: - needs: collect-results + needs: [collect-results, collect-evals] if: ${{ always() }} runs-on: ubuntu-latest diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml deleted file mode 100644 index 3ba954838..000000000 --- a/.github/workflows/full-sweep-test.yml +++ /dev/null @@ -1,503 +0,0 @@ -name: Test - Full Sweep - -on: - workflow_dispatch: - inputs: - run_1k1k: - type: boolean - required: false - run_8k1k: - type: boolean - required: false - 
run_1k8k: - type: boolean - required: false - use_h100: - type: boolean - required: false - use_h200: - type: boolean - required: false - use_b200: - type: boolean - required: false - use_mi300x: - type: boolean - required: false - use_mi325x: - type: boolean - required: false - use_mi355x: - type: boolean - required: false - use_gb200: - type: boolean - required: false - -jobs: - get-configs: - runs-on: ubuntu-latest - outputs: - dsr1-1k1k: ${{ steps.generate-configs.outputs.dsr1-1k1k }} - dsr1-1k8k: ${{ steps.generate-configs.outputs.dsr1-1k8k }} - dsr1-8k1k: ${{ steps.generate-configs.outputs.dsr1-8k1k }} - gptoss-1k1k: ${{ steps.generate-configs.outputs.gptoss-1k1k }} - gptoss-1k8k: ${{ steps.generate-configs.outputs.gptoss-1k8k }} - gptoss-8k1k: ${{ steps.generate-configs.outputs.gptoss-8k1k }} - steps: - - name: Checkout code - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 - - # This looks complicated, but it is just calling generate_sweep_configs.py conditioned on - # discrete inputs (i.e., run_1k1k, run_h100, etc.) to split the test sweep into discrete jobs - - id: generate-configs - run: | - pip install pydantic - - set -x - # Build runner type filters based on inputs - RUNNER_TYPES="${{ inputs.use_h100 && 'h100' || '' }} ${{ inputs.use_h200 && 'h200' || '' }} ${{ inputs.use_b200 && 'b200 b200-trt' || '' }} ${{ inputs.use_mi300x && 'mi300x' || '' }} ${{ inputs.use_mi325x && 'mi325x' || '' }} ${{ inputs.use_mi355x && 'mi355x' || '' }}" - - # DSR1 doesn't support H100, so exclude it - DSR1_RUNNER_TYPES=$(echo $RUNNER_TYPES | sed 's/\bh100\b//g' | xargs) - - # Generate dsr1 configs (only if we have valid runner types for DSR1) - if [ "${{ inputs.run_1k1k }}" = "true" ] && [ -n "$DSR1_RUNNER_TYPES" ]; then - DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml --run-evals) - echo "dsr1-1k1k=$DSR1_1K1K" >> $GITHUB_OUTPUT - else - echo "dsr1-1k1k=[]" >> $GITHUB_OUTPUT - fi - - if [ "${{ inputs.run_1k8k }}" = "true" ] && [ -n "$DSR1_RUNNER_TYPES" ]; then - DSR1_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml --run-evals) - echo "dsr1-1k8k=$DSR1_1K8K" >> $GITHUB_OUTPUT - else - echo "dsr1-1k8k=[]" >> $GITHUB_OUTPUT - fi - - if [ "${{ inputs.run_8k1k }}" = "true" ] && [ -n "$DSR1_RUNNER_TYPES" ]; then - DSR1_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml --run-evals) - echo "dsr1-8k1k=$DSR1_8K1K" >> $GITHUB_OUTPUT - else - echo "dsr1-8k1k=[]" >> $GITHUB_OUTPUT - fi - - # Generate gptoss configs (only if we have runner types selected) - if [ "${{ inputs.run_1k1k }}" = "true" ] && [ -n "$RUNNER_TYPES" ]; then - GPTOSS_1K1K=$(python3 
${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml --run-evals) - echo "gptoss-1k1k=$GPTOSS_1K1K" >> $GITHUB_OUTPUT - else - echo "gptoss-1k1k=[]" >> $GITHUB_OUTPUT - fi - - if [ "${{ inputs.run_1k8k }}" = "true" ] && [ -n "$RUNNER_TYPES" ]; then - GPTOSS_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml --run-evals) - echo "gptoss-1k8k=$GPTOSS_1K8K" >> $GITHUB_OUTPUT - else - echo "gptoss-1k8k=[]" >> $GITHUB_OUTPUT - fi - - if [ "${{ inputs.run_8k1k }}" = "true" ] && [ -n "$RUNNER_TYPES" ]; then - GPTOSS_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml --run-evals) - echo "gptoss-8k1k=$GPTOSS_8K1K" >> $GITHUB_OUTPUT - else - echo "gptoss-8k1k=[]" >> $GITHUB_OUTPUT - fi - - # DSR1 1K1K Benchmarks - benchmark-dsr1-1k1k: - needs: get-configs - if: ${{ needs.get-configs.outputs.dsr1-1k1k != '[]' }} - uses: ./.github/workflows/benchmark-tmpl.yml - name: dsr1 1k1k / - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-configs.outputs.dsr1-1k1k) }} - secrets: inherit - with: - exp-name: ${{ matrix.config.exp-name }} - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - run-eval: ${{ matrix.config.run-eval }} - - collect-dsr1-1k1k-results: - needs: benchmark-dsr1-1k1k - if: ${{ always() && needs.get-configs.outputs.dsr1-1k1k != '[]' }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "dsr1_1k1k" - - collect-dsr1-1k1k-evals: - needs: benchmark-dsr1-1k1k - if: ${{ always() && needs.get-configs.outputs.dsr1-1k1k != '[]' }} - uses: ./.github/workflows/collect-evals.yml - secrets: inherit - with: - exp-name: "dsr1_1k1k" - - # GPTOSS 1K1K Benchmarks - benchmark-gptoss-1k1k: - needs: get-configs - if: ${{ needs.get-configs.outputs.gptoss-1k1k != '[]' }} - uses: ./.github/workflows/benchmark-tmpl.yml - name: gptoss 1k1k / - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-configs.outputs.gptoss-1k1k) }} - secrets: inherit - with: - exp-name: ${{ matrix.config.exp-name }} - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: 
${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - run-eval: ${{ matrix.config.run-eval }} - - collect-gptoss-1k1k-results: - needs: benchmark-gptoss-1k1k - if: ${{ always() && needs.get-configs.outputs.gptoss-1k1k != '[]' }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "gptoss_1k1k" - - collect-gptoss-1k1k-evals: - needs: benchmark-gptoss-1k1k - if: ${{ always() && needs.get-configs.outputs.gptoss-1k1k != '[]' }} - uses: ./.github/workflows/collect-evals.yml - secrets: inherit - with: - exp-name: "gptoss_1k1k" - - - # DSR1 8K1K Benchmarks - benchmark-dsr1-8k1k: - needs: get-configs - if: ${{ needs.get-configs.outputs.dsr1-8k1k != '[]' }} - uses: ./.github/workflows/benchmark-tmpl.yml - name: dsr1 8k1k / - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-configs.outputs.dsr1-8k1k) }} - secrets: inherit - with: - exp-name: ${{ matrix.config.exp-name }} - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - run-eval: ${{ matrix.config.run-eval }} - - collect-dsr1-8k1k-results: - needs: benchmark-dsr1-8k1k - if: ${{ always() && needs.get-configs.outputs.dsr1-8k1k != '[]' }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "dsr1_8k1k" - - collect-dsr1-8k1k-evals: - needs: benchmark-dsr1-8k1k - if: ${{ always() && needs.get-configs.outputs.dsr1-8k1k != '[]' }} - uses: ./.github/workflows/collect-evals.yml - secrets: inherit - with: - exp-name: "dsr1_8k1k" - - # GPTOSS 8K1K Benchmarks - benchmark-gptoss-8k1k: - needs: get-configs - if: ${{ needs.get-configs.outputs.gptoss-8k1k != '[]' }} - uses: ./.github/workflows/benchmark-tmpl.yml - name: gptoss 8k1k / - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-configs.outputs.gptoss-8k1k) }} - secrets: inherit - with: - exp-name: ${{ matrix.config.exp-name }} - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - run-eval: ${{ matrix.config.run-eval }} - - collect-gptoss-8k1k-results: - needs: benchmark-gptoss-8k1k - if: ${{ always() && needs.get-configs.outputs.gptoss-8k1k != '[]' }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "gptoss_8k1k" - - collect-gptoss-8k1k-evals: - needs: benchmark-gptoss-8k1k - if: ${{ always() && needs.get-configs.outputs.gptoss-8k1k != '[]' }} - uses: ./.github/workflows/collect-evals.yml - secrets: inherit - with: - exp-name: "gptoss_8k1k" - - - # DSR1 1K8K Benchmarks - benchmark-dsr1-1k8k: - needs: get-configs - if: ${{ needs.get-configs.outputs.dsr1-1k8k != '[]' }} - uses: ./.github/workflows/benchmark-tmpl.yml - name: dsr1 1k8k / - strategy: - fail-fast: false - matrix: - config: ${{ 
fromJson(needs.get-configs.outputs.dsr1-1k8k) }} - secrets: inherit - with: - exp-name: ${{ matrix.config.exp-name }} - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - run-eval: ${{ matrix.config.run-eval }} - - # This is a workaround until we can integrate GB200 into master configs. - benchmark-gb200-1k1k: - if: ${{ inputs.use_gb200 && inputs.run_1k1k }} - uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: gb200 1k1k sweep - strategy: - fail-fast: false - matrix: - config: &dsr1_static_configs - - { - "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", - "model": "deepseek-r1-fp4", - "model-prefix": "dsr1", - "precision": "fp4", - "framework": "dynamo-trtllm", - "mtp": "off", - } - - { - "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", - "model": "deepseek-r1-fp4", - "model-prefix": "dsr1", - "precision": "fp4", - "framework": "dynamo-trtllm", - "mtp": "on", - } - - { - "image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", - "model": "deepseek-ai/DeepSeek-R1-0528", - "model-prefix": "dsr1", - "precision": "fp8", - "framework": "dynamo-sglang", - "mtp": "off", - } - secrets: inherit - with: - runner: gb200 - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.model-prefix }}_1k1k - isl: 1024 - osl: 1024 - max-model-len: 2048 - mtp-mode: ${{ matrix.config.mtp }} - - benchmark-gb200-1k8k: - if: ${{ inputs.use_gb200 && inputs.run_1k8k }} - uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: gb200 1k8k sweep - strategy: - fail-fast: false - matrix: - config: *dsr1_static_configs - secrets: inherit - with: - runner: gb200 - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.model-prefix }}_1k8k - isl: 1024 - osl: 8192 - max-model-len: 9216 - mtp-mode: ${{ matrix.config.mtp }} - - benchmark-gb200-8k1k: - if: ${{ inputs.use_gb200 && inputs.run_8k1k }} - uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: gb200 8k1k sweep - strategy: - fail-fast: false - matrix: - config: *dsr1_static_configs - secrets: inherit - with: - runner: gb200 - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.model-prefix }}_8k1k - isl: 1024 - osl: 8192 - max-model-len: 9216 - mtp-mode: ${{ matrix.config.mtp }} - - collect-dsr1-1k8k-results: - needs: - [ - benchmark-dsr1-1k8k, - benchmark-gb200-1k1k, - benchmark-gb200-1k8k, - benchmark-gb200-8k1k, - ] - if: ${{ always() && needs.get-configs.outputs.dsr1-1k8k != '[]' }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "dsr1_1k8k" - - collect-dsr1-1k8k-evals: - needs: benchmark-dsr1-1k8k - if: ${{ always() && needs.get-configs.outputs.dsr1-1k8k != '[]' }} - uses: ./.github/workflows/collect-evals.yml - secrets: inherit - with: - exp-name: "dsr1_1k8k" - 
- - # GPTOSS 1K8K Benchmarks - benchmark-gptoss-1k8k: - needs: get-configs - if: ${{ needs.get-configs.outputs.gptoss-1k8k != '[]' }} - uses: ./.github/workflows/benchmark-tmpl.yml - name: gptoss 1k8k / - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-configs.outputs.gptoss-1k8k) }} - secrets: inherit - with: - exp-name: ${{ matrix.config.exp-name }} - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - run-eval: ${{ matrix.config.run-eval }} - - collect-gptoss-1k8k-results: - needs: benchmark-gptoss-1k8k - if: ${{ always() && needs.get-configs.outputs.gptoss-1k8k != '[]' }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "gptoss_1k8k" - - collect-gptoss-1k8k-evals: - needs: benchmark-gptoss-1k8k - if: ${{ always() && needs.get-configs.outputs.gptoss-1k8k != '[]' }} - uses: ./.github/workflows/collect-evals.yml - secrets: inherit - with: - exp-name: "gptoss_1k8k" - - - calc-success-rate: - needs: - [ - collect-dsr1-1k1k-results, - collect-dsr1-1k8k-results, - collect-dsr1-8k1k-results, - collect-gptoss-1k1k-results, - collect-gptoss-1k8k-results, - collect-gptoss-8k1k-results, - ] - if: ${{ always() }} - runs-on: ubuntu-latest - - env: - RESULTS_DIR: "results/" - STATS_FILENAME: "run_stats" - GITHUB_TOKEN: ${{ secrets.REPO_PAT }} - - steps: - - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 - with: - token: ${{ secrets.REPO_PAT }} - fetch-depth: 0 - - - name: Download results artifacts - uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0 - with: - path: ${{ env.RESULTS_DIR }} - pattern: results_* - - - name: Install python dependencies - run: pip install PyGithub - - - name: Calculate success rate - run: python3 utils/calc_success_rate.py $STATS_FILENAME - - - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 - with: - name: "run-stats" - path: ${{ env.STATS_FILENAME }}.json diff --git a/.github/workflows/run-sweep.yml b/.github/workflows/run-sweep.yml index 6a459de40..adcfd3dbf 100644 --- a/.github/workflows/run-sweep.yml +++ b/.github/workflows/run-sweep.yml @@ -185,6 +185,23 @@ jobs: with: result-prefix: "bmk" + collect-evals: + needs: + [ + sweep-single-node-1k1k, + sweep-single-node-1k8k, + sweep-single-node-8k1k, + sweep-multi-node-1k1k, + sweep-multi-node-1k8k, + sweep-multi-node-8k1k, + setup, + ] + if: ${{ always() && needs.setup.result != 'skipped' }} + uses: ./.github/workflows/collect-evals.yml + secrets: inherit + with: + result-prefix: "bmk" + upload-changelog-metadata: needs: [setup, collect-results] if: ${{ always() && needs.setup.result != 'skipped' }} From 37d05d32d4c302bac0a4a8b0f843bc76feaacc52 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 16 Dec 2025 11:53:52 -0600 Subject: [PATCH 183/214] add run-eval to e2e tests --- .github/workflows/benchmark-tmpl.yml | 7 +++---- .github/workflows/e2e-tests.yml | 1 + utils/matrix_logic/generate_sweep_configs.py | 4 ++-- utils/matrix_logic/validation.py | 6 +++--- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/benchmark-tmpl.yml 
b/.github/workflows/benchmark-tmpl.yml index 284443961..08a4b5fef 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -50,14 +50,13 @@ on: disagg: required: true type: string + run-eval: + type: boolean + required: true random-range-ratio: required: false type: string default: '0.8' - run-eval: - type: boolean - required: false - default: false env: HF_TOKEN: ${{ secrets.HF_TOKEN }} HF_HUB_CACHE: '/mnt/hf_hub_cache/' diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 11057abcd..1fca38d1b 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -98,6 +98,7 @@ jobs: conc: ${{ matrix.config.conc }} spec-decoding: ${{ matrix.config.spec-decoding }} disagg: ${{ matrix.config.disagg }} + run-eval: ${{ matrix.config.run-eval }} collect-results: needs: [test-sweep-multi-node, test-sweep-single-node] diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py index b172c2efd..db6826079 100644 --- a/utils/matrix_logic/generate_sweep_configs.py +++ b/utils/matrix_logic/generate_sweep_configs.py @@ -83,7 +83,7 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: # Mark the selected entries for i, entry in enumerate(matrix_values): - entry[Fields.FIELD_RUN_EVAL.value] = i in eval_indices + entry[Fields.RUN_EVAL.value] = i in eval_indices return matrix_values @@ -746,7 +746,7 @@ def main(): matrix_values = mark_eval_entries(matrix_values) # IF --evals-only is specified, filter to only eval entries if args.evals_only: - matrix_values = [e for e in matrix_values if e.get(Fields.FIELD_RUN_EVAL.value, False)] + matrix_values = [e for e in matrix_values if e.get(Fields.RUN_EVAL.value, False)] print(json.dumps(matrix_values)) return matrix_values diff --git a/utils/matrix_logic/validation.py b/utils/matrix_logic/validation.py index 4d79d27a9..424763914 100644 --- a/utils/matrix_logic/validation.py +++ b/utils/matrix_logic/validation.py @@ -52,7 +52,7 @@ class Fields(Enum): DISAGG = 'disagg' # Eval - FIELD_RUN_EVAL = 'run-eval' + RUN_EVAL = 'run-eval' """ @@ -88,7 +88,7 @@ class SingleNodeMatrixEntry(BaseModel): max_model_len: int = Field(alias=Fields.MAX_MODEL_LEN.value) exp_name: str = Field(alias=Fields.EXP_NAME.value) disagg: bool - run_eval: bool = Field(alias='run-eval', default=False) + run_eval: bool = Field(alias=Fields.RUN_EVAL.value, default=False) class WorkerConfig(BaseModel): @@ -125,7 +125,7 @@ class MultiNodeMatrixEntry(BaseModel): max_model_len: int = Field(alias=Fields.MAX_MODEL_LEN.value) exp_name: str = Field(alias=Fields.EXP_NAME.value) disagg: bool - run_eval: bool = Field(alias='run-eval', default=False) + run_eval: bool = Field(alias=Fields.RUN_EVAL.value, default=False) def validate_matrix_entry(entry: dict, is_multinode: bool) -> dict: From 6a546e531cc3fb9d866f1cb14ca752290894cf0a Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 16 Dec 2025 10:01:11 -0800 Subject: [PATCH 184/214] math500 prompt and h200 trt evals --- benchmarks/gptoss_fp4_h200_trt_slurm.sh | 7 +++++++ utils/evals/math500.yaml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/benchmarks/gptoss_fp4_h200_trt_slurm.sh b/benchmarks/gptoss_fp4_h200_trt_slurm.sh index 12a6af5b7..26043d322 100644 --- a/benchmarks/gptoss_fp4_h200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_trt_slurm.sh @@ -74,3 +74,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After 
throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 )) + append_lm_eval_summary +fi +set +x \ No newline at end of file diff --git a/utils/evals/math500.yaml b/utils/evals/math500.yaml index 09051d118..2e172e7f0 100644 --- a/utils/evals/math500.yaml +++ b/utils/evals/math500.yaml @@ -9,7 +9,7 @@ dataset_name: algebra output_type: generate_until training_split: train test_split: test -doc_to_text: "You are solving competition math problems.\n\nFormat rules:\n- Answer in a new. That line must start with `Answer: ` (capital A, colon, one space).\n- After `Answer: `, write ONLY the answer as inline LaTeX.\n- Use ONLY ASCII LaTeX commands (e.g. \\pi, \\frac{1}{2}, -). NO Unicode symbols.\n- Do NOT wrap the answer in $, $$, \\( \\), \\[ \\], or any other delimiters.\n- Do NOT use \\displaystyle or any display-style commands. Answer only this problem, the rest are examples. Problem: {{problem}}\n" +doc_to_text: "You are solving competition math problems.\n\nFormat rules:\n- Answer in a new line that starts with `Answer: `.\n- After `Answer: `, write ONLY the answer as inline LaTeX.\n- Use ONLY ASCII LaTeX commands (e.g. \\pi, \\frac{1}{2}, -). NO Unicode symbols.\n- Do NOT wrap the answer in $, $$, \\( \\), \\[ \\], or any other delimiters.\n- Do NOT use \\displaystyle or any display-style commands. Answer only this problem, the rest are examples. Problem: {{problem}}\n" process_results: !function utils.process_results doc_to_target: "{{answer}}" generation_kwargs: From d299d417de828c09d272fce529f2df8dd5aaffef Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 16 Dec 2025 13:14:57 -0600 Subject: [PATCH 185/214] remove run prefix --- .github/workflows/e2e-tests.yml | 2 ++ .github/workflows/run-sweep.yml | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 1fca38d1b..7e128253d 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -105,6 +105,8 @@ jobs: if: ${{ always() }} uses: ./.github/workflows/collect-results.yml secrets: inherit + with: + result-prefix: "bmk" collect-evals: needs: [test-sweep-multi-node, test-sweep-single-node] diff --git a/.github/workflows/run-sweep.yml b/.github/workflows/run-sweep.yml index adcfd3dbf..224bae7f9 100644 --- a/.github/workflows/run-sweep.yml +++ b/.github/workflows/run-sweep.yml @@ -199,8 +199,6 @@ jobs: if: ${{ always() && needs.setup.result != 'skipped' }} uses: ./.github/workflows/collect-evals.yml secrets: inherit - with: - result-prefix: "bmk" upload-changelog-metadata: needs: [setup, collect-results] From 569d0c3607b28f42189b20508f4d354fa99664c3 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 16 Dec 2025 13:54:10 -0600 Subject: [PATCH 186/214] add result-prefix to benchmark tmpl uploaded artifacts --- .github/workflows/benchmark-tmpl.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 08a4b5fef..6f2dead4f 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -173,10 +173,11 @@ jobs: RUNNER_TYPE: ${{ inputs.runner }} run: | python3 utils/process_result.py + - name: Upload result uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 with: - name: ${{ env.RESULT_FILENAME }} + name: bmk_${{ env.RESULT_FILENAME }} path: agg_${{ env.RESULT_FILENAME 
}}.json
 
     - name: Upload eval results (if any)

From 30a3431f23aa05bdfd9f0e8761db1ef6a6a571cf Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Tue, 16 Dec 2025 20:35:05 -0800
Subject: [PATCH 187/214] Evals summary refactor

---
 utils/collect_eval_results.py | 369 +++++++++++++---------------------
 utils/summarize.py            | 204 +++++++++++--------
 2 files changed, 257 insertions(+), 316 deletions(-)

diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py
index 4f6f0dd30..bb089d519 100644
--- a/utils/collect_eval_results.py
+++ b/utils/collect_eval_results.py
@@ -1,235 +1,138 @@
 #!/usr/bin/env python3
-import os
 import sys
 import json
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
+from tabulate import tabulate
+
+# Import shared utilities from summarize
+sys.path.insert(0, str(Path(__file__).resolve().parent))
+from summarize import (
+    load_json, MODEL, HARDWARE, FRAMEWORK, PRECISION,
+    TP, EP, CONC, DP_ATTENTION, TASK, EM_STRICT, EM_FLEXIBLE, N_EFF
+)
 
 
 def find_eval_sets(root: Path) -> List[Path]:
     """Return directories that contain a meta_env.json (one set per job).
-
-    New structure: each downloaded artifact is placed under
-    eval_results/<artifact>/ with flat files inside, e.g.:
-      - meta_env.json
-      - results_*.json
-
-    We first check immediate child directories for meta_env.json to avoid
-    descending unnecessarily. If nothing is found (backward compatibility),
-    fall back to recursive search.
+
+    Structure: eval_results/<artifact>/meta_env.json
     """
     out: List[Path] = []
-    # Prefer immediate children (one directory per artifact)
     try:
         for d in root.iterdir():
             if d.is_dir() and (d / 'meta_env.json').exists():
                 out.append(d)
     except Exception:
         pass
-    if out:
-        return out
-    # Fallback: recursive (legacy structure)
-    for p in root.rglob('meta_env.json'):
-        out.append(p.parent)
     return out
 
 
-def load_json(path: Path) -> Optional[Dict[str, Any]]:
-    try:
-        with open(path, 'r') as f:
-            return json.load(f)
-    except Exception:
-        return None
-
-
 def detect_eval_jsons(d: Path) -> Tuple[Optional[Path], Optional[Path]]:
-    """Return (lm_eval_json, lighteval_json) if present (latest by mtime).
-
-    New structure places result JSONs flat in the artifact directory. We
-    first check only the immediate directory for JSONs, then fall back to
-    recursive search for backward compatibility.
+    """Return (lm_eval_json, lighteval_json) if present.
+
+    Checks immediate directory for result JSONs.
     """
-    def scan_jsons(paths: List[Path]) -> Tuple[List[Tuple[float, Path]], List[Tuple[float, Path]]]:
-        """Classify JSON files into lm-eval vs lighteval buckets.
-
-        Returns two lists of (mtime, path) where:
-        - The first list contains candidates that look like lm-eval outputs.
-        - The second list contains candidates that look like lighteval outputs.
-
-        Heuristics used (order matters):
-        - If a JSON has keys like 'lm_eval_version' or 'pretty_env_info',
-          we treat it as an lm-eval result file.
-        - If it has both 'config_general' and 'results', we treat it as
-          a lighteval result file.
-        - If it only has a top-level 'results' but none of the stronger
-          signals above, we fall back to classifying it as lm-eval.
-
-        We keep the file modification time to later choose the most recent
-        candidate; if obtaining mtime fails, we fall back to 0. 
- """ - lm: List[Tuple[float, Path]] = [] - le: List[Tuple[float, Path]] = [] - for p in paths: - if p.name == 'meta_env.json': - continue - data = load_json(p) - if not isinstance(data, dict): - continue - if 'lm_eval_version' in data or 'pretty_env_info' in data: - # lm-eval harness output - try: - lm.append((p.stat().st_mtime, p)) - except Exception: - lm.append((0, p)) - elif 'config_general' in data and 'results' in data: - # lighteval output structure - try: - le.append((p.stat().st_mtime, p)) - except Exception: - le.append((0, p)) - elif 'results' in data: - # Fallback: treat as lm-eval JSON - try: - lm.append((p.stat().st_mtime, p)) - except Exception: - lm.append((0, p)) - return lm, le - - # 1) Prefer immediate JSONs (flat structure) - immediate_jsons = list(d.glob('results*.json')) + [p for p in d.glob('*.json') if p.name != 'meta_env.json'] - lm, le = scan_jsons(immediate_jsons) - - # 2) If nothing found, fallback to deep scan (legacy) - if not lm and not le: - deep_jsons = list(d.rglob('*.json')) - lm, le = scan_jsons(deep_jsons) - - lm_path = sorted(lm, key=lambda x: x[0])[-1][1] if lm else None - le_path = sorted(le, key=lambda x: x[0])[-1][1] if le else None + immediate_jsons = list(d.glob('results*.json')) + [ + p for p in d.glob('*.json') if p.name != 'meta_env.json' + ] + + lm_path = None + le_path = None + + for p in immediate_jsons: + data = load_json(p) + if not isinstance(data, dict): + continue + + if 'lm_eval_version' in data: + # lm-eval harness - pick latest if multiple + if lm_path is None or p.stat().st_mtime > lm_path.stat().st_mtime: + lm_path = p + elif 'config_general' in data and 'results' in data: + # lighteval - pick latest if multiple + if le_path is None or p.stat().st_mtime > le_path.stat().st_mtime: + le_path = p + return lm_path, le_path -def extract_lm_metrics(json_path: Path, task: Optional[str] = None) -> Dict[str, Any]: +def extract_lm_metrics(json_path: Path) -> Dict[str, Any]: + """Extract metrics from lm-eval harness result JSON. 
+ + Uses explicit structure from the JSON file: + - Task name from results keys + - Metric name from configs.metric_list + - Filter names from configs.filter_list + - Values from results[task][metric,filter] + """ data = load_json(json_path) or {} - results = data.get('results') or {} - # Determine task key robustly: - # 1) explicit argument - # 2) only key in `results` - # 3) only key in `configs` - # 4) 'unknown' - t = task - if not t: - if isinstance(results, dict) and len(results) == 1: - t = next(iter(results.keys())) - else: - cfgs = data.get('configs') or {} - if isinstance(cfgs, dict) and len(cfgs) == 1: - t = next(iter(cfgs.keys())) - else: - # fallback to arbitrary but stable choice - t = next(iter(results.keys()), 'unknown') if isinstance(results, dict) else 'unknown' - - res = results.get(t, {}) if isinstance(results, dict) else {} - - # Determine base metric name (e.g., 'exact_match') - base_metric: Optional[str] = None - hib = (data.get('higher_is_better') or {}).get(t) if isinstance(data.get('higher_is_better'), dict) else None - if isinstance(hib, dict) and hib: - base_metric = next(iter(hib.keys())) - if not base_metric: - cfg = (data.get('configs') or {}).get(t, {}) if isinstance(data.get('configs'), dict) else {} - ml = cfg.get('metric_list') if isinstance(cfg, dict) else None - if isinstance(ml, list) and ml: - m0 = ml[0] or {} - if isinstance(m0, dict): - base_metric = m0.get('metric') - if not base_metric: - # Fallback: infer from result keys - if isinstance(res, dict): - for k in res.keys(): - if isinstance(k, str) and ',' in k: - base_metric = k.split(',', 1)[0] - break - if not base_metric and 'exact_match' in res: - base_metric = 'exact_match' - if not base_metric: - base_metric = 'exact_match' - - # Determine filter names and map to strict/flexible logically without guessing - strict_name: Optional[str] = None - flex_name: Optional[str] = None - cfg = (data.get('configs') or {}).get(t, {}) if isinstance(data.get('configs'), dict) else {} - fl = cfg.get('filter_list') if isinstance(cfg, dict) else None - filter_names: List[str] = [] - if isinstance(fl, list): - for it in fl: - if isinstance(it, dict): - nm = it.get('name') - if isinstance(nm, str): - filter_names.append(nm) - # Prefer semantic names when present; otherwise preserve file order - for nm in filter_names: - if strict_name is None and 'strict' in nm.lower(): - strict_name = nm - if flex_name is None and ('flex' in nm.lower() or 'extract' in nm.lower()): - flex_name = nm - # Fallback to first/second if semantic match not found - if not strict_name and filter_names: - strict_name = filter_names[0] - if not flex_name and len(filter_names) >= 2: - flex_name = filter_names[1] - - # Extract metrics present in results using derived keys - def get_pair(fname: Optional[str]) -> Tuple[Optional[float], Optional[float]]: - if not fname: - # try unfiltered key - v = res.get(base_metric) - se = res.get(f"{base_metric}_stderr") - try: - return float(v) if v is not None else None, float(se) if se is not None else None - except Exception: - return v, se - v = res.get(f"{base_metric},{fname}") - se = res.get(f"{base_metric}_stderr,{fname}") - try: - return float(v) if v is not None else None, float(se) if se is not None else None - except Exception: - return v, se - - strict, strict_se = get_pair(strict_name) - flex, flex_se = get_pair(flex_name) - - n_eff = None - ns = data.get('n-samples') or data.get('n_samples') or {} - if isinstance(ns, dict): - td = ns.get(t) or {} - if isinstance(td, dict): - n_eff = 
td.get('effective') or td.get('n_eff') - + results = data.get('results', {}) + configs = data.get('configs', {}) + + if not results: + return {} + + # 1. Task: first key from results + task = next(iter(results.keys())) + + # 2. Base metric: from config's metric_list + metric_list = configs.get(task, {}).get('metric_list', []) + base_metric = metric_list[0]['metric'] if metric_list else 'exact_match' + + # 3. Filters: from config's filter_list + filter_list = configs.get(task, {}).get('filter_list', []) + + strict_val, strict_se = None, None + flex_val, flex_se = None, None + + # Helper to get value/stderr pair for filtered metrics + def get_val_se(filter_name: str) -> Tuple[Optional[float], Optional[float]]: + val_key = f"{base_metric},{filter_name}" + se_key = f"{base_metric}_stderr,{filter_name}" + return results[task].get(val_key), results[task].get(se_key) + + # Extract metrics based on filter_list + if not filter_list: + # No filters - use base metric for strict + strict_val = results[task].get(base_metric) + strict_se = results[task].get(f"{base_metric}_stderr") + else: + # Extract metrics for each filter + for f in filter_list: + fname = f['name'] + if 'strict' in fname: + strict_val, strict_se = get_val_se(fname) + elif 'flex' in fname or 'extract' in fname: + flex_val, flex_se = get_val_se(fname) + + # N-samples (effective count) + n_eff = data.get('n-samples', {}).get(task, {}).get('effective') + + # Model name model = ( - data.get('model_name') - or (data.get('configs', {}).get(t, {}) or {}).get('metadata', {}).get('model') - or (data.get('config') or {}).get('model') - or '' + data.get('model_name') + or configs.get(task, {}).get('metadata', {}).get('model') ) return { - 'task': t, - 'strict': strict, - 'flex': flex, + 'task': task, + 'strict': strict_val, 'strict_se': strict_se, + 'flex': flex_val, 'flex_se': flex_se, 'n_eff': n_eff, - 'hardware': 'Unknown GPU', 'model': model, 'source': str(json_path) } def extract_lighteval_metrics(json_path: Path, task_base: Optional[str] = None) -> Dict[str, Any]: + """Extract metrics from lighteval result JSON.""" data = load_json(json_path) or {} results = data.get('results', {}) or {} - # Choose a task key starting with task_base if provided, else 'all', else first key + + # Find task key key = None if task_base: for k in results.keys(): @@ -237,12 +140,12 @@ def extract_lighteval_metrics(json_path: Path, task_base: Optional[str] = None) key = k break if key is None: - key = 'all' if 'all' in results else (next(iter(results.keys())) if results else 'unknown') - r = results.get(key, {}) if isinstance(results, dict) else {} + key = next(iter(results.keys())) if results else 'unknown' + + r = results.get(key, {}) em = r.get('extractive_match') em_se = r.get('extractive_match_stderr') - model = '' cg = data.get('config_general', {}) or {} model = cg.get('model_name') or cg.get('model_config', {}).get('model_name', '') @@ -253,13 +156,13 @@ def extract_lighteval_metrics(json_path: Path, task_base: Optional[str] = None) 'strict_se': em_se, 'flex_se': None, 'n_eff': None, - 'hardware': 'Unknown GPU', 'model': model, 'source': str(json_path) } def pct(x: Any) -> str: + """Format value as percentage.""" try: return f"{float(x)*100:.2f}%" except Exception: @@ -267,6 +170,7 @@ def pct(x: Any) -> str: def se(x: Any) -> str: + """Format stderr as percentage with ± prefix.""" try: return f" ±{float(x)*100:.2f}%" except Exception: @@ -279,13 +183,14 @@ def main(): sys.exit(1) root = Path(sys.argv[1]) - exp_name = sys.argv[2] or 'all' + exp_name = 
sys.argv[2] rows: List[Dict[str, Any]] = [] for d in find_eval_sets(root): meta = load_json(d / 'meta_env.json') or {} lm_path, le_path = detect_eval_jsons(d) - # Prefer lm-eval when available, else lighteval + + # Extract metrics (prefer lm-eval) if lm_path: m = extract_lm_metrics(lm_path) elif le_path: @@ -293,16 +198,20 @@ def main(): else: continue + if not m: + continue + + # Build row from meta + metrics row = { - 'model': m.get('model') or meta.get('model') or 'unknown', - 'hw': (meta.get('hw') or 'unknown').upper(), - 'framework': (meta.get('framework') or 'unknown').lower(), - 'precision': (meta.get('precision') or 'unknown').lower(), - 'tp': int(meta.get('tp') or 1), - 'ep': int(meta.get('ep') or 1), - 'conc': int(meta.get('conc') or 0), - 'dp_attention': str(meta.get('dp_attention') or 'false'), - 'task': m.get('task') or 'unknown', + 'model': m.get('model') or meta.get('model', 'unknown'), + 'hw': meta.get('hw', 'unknown').upper(), + 'framework': meta.get('framework', 'unknown').lower(), + 'precision': meta.get('precision', 'unknown').lower(), + 'tp': int(meta.get('tp', 1)), + 'ep': int(meta.get('ep', 1)), + 'conc': int(meta.get('conc', 0)), + 'dp_attention': str(meta.get('dp_attention', False)).lower(), + 'task': m.get('task', 'unknown'), 'em_strict': m.get('strict'), 'em_strict_se': m.get('strict_se'), 'em_flexible': m.get('flex'), @@ -314,31 +223,37 @@ def main(): # Sort for stable output rows.sort(key=lambda r: ( - r.get('hw',''), r.get('framework',''), - r.get('precision',''), r.get('tp',0), r.get('conc',0) + r['hw'], r['framework'], r['precision'], r['tp'], r['conc'] )) if not rows: print('> No eval results found to summarize.') else: - # Print Markdown summary table - print('| Model | Hardware | Framework | Precision | TP | EP | Conc | DPA | Task | EM Strict | EM Flexible | N (eff) |') - print('| :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |') - for r in rows: - print( - f"| {r['model']} " - f"| {r['hw']} " - f"| {r['framework'].upper()} " - f"| {r['precision'].upper()} " - f"| {r['tp']} " - f"| {r['ep']} " - f"| {r['conc']} " - f"| {r['dp_attention']} " - f"| {r['task']} " - f"| {pct(r['em_strict'])}{se(r['em_strict_se'])} " - f"| {pct(r['em_flexible'])}{se(r['em_flexible_se'])} " - f"| {r['n_eff'] or ''} |" - ) + # Print table using tabulate + headers = [ + MODEL, HARDWARE, FRAMEWORK, PRECISION, TP, EP, CONC, DP_ATTENTION, + TASK, EM_STRICT, EM_FLEXIBLE, N_EFF + ] + + table_rows = [ + [ + r['model'], + r['hw'], + r['framework'].upper(), + r['precision'].upper(), + r['tp'], + r['ep'], + r['conc'], + r['dp_attention'], + r['task'], + f"{pct(r['em_strict'])}{se(r['em_strict_se'])}", + f"{pct(r['em_flexible'])}{se(r['em_flexible_se'])}", + r['n_eff'] or '' + ] + for r in rows + ] + + print(tabulate(table_rows, headers=headers, tablefmt="github")) # Write JSON aggregate out_path = Path(f'agg_eval_{exp_name}.json') @@ -347,4 +262,4 @@ def main(): if __name__ == '__main__': - main() + main() \ No newline at end of file diff --git a/utils/summarize.py b/utils/summarize.py index a46c2e02a..c40754ab7 100644 --- a/utils/summarize.py +++ b/utils/summarize.py @@ -1,6 +1,7 @@ import sys import json from pathlib import Path +from typing import Any, Dict, Optional from tabulate import tabulate # Header constants @@ -33,95 +34,120 @@ DECODE_WORKERS = "Decode Workers" DECODE_GPUS = "Decode GPUs" -results = [] -results_dir = Path(sys.argv[1]) -for result_path in results_dir.rglob('*.json'): - with open(result_path) as f: - result = json.load(f) - 
results.append(result)
-
-single_node_results = [r for r in results if not r['is_multinode']]
-multinode_results = [r for r in results if r['is_multinode']]
-
-# Single-node and multi-node results have different fields and therefore need to be printed separately
-if single_node_results:
-    single_node_results.sort(key=lambda r: (
-        r['infmax_model_prefix'], r['hw'], r['framework'], r['precision'], r['isl'], r['osl'], r['tp'], r['ep'], r['conc']))
-
-    single_node_headers = [
-        MODEL, SERVED_MODEL, HARDWARE, FRAMEWORK, PRECISION, ISL, OSL, TP, EP, DP_ATTENTION,
-        CONC, TTFT, TPOT, INTERACTIVITY, E2EL, TPUT_PER_GPU, OUTPUT_TPUT_PER_GPU, INPUT_TPUT_PER_GPU
-    ]
-
-    single_node_rows = [
-        [
-            r['infmax_model_prefix'],
-            r['model'],
-            r['hw'].upper(),
-            r['framework'].upper(),
-            r['precision'].upper(),
-            r['isl'],
-            r['osl'],
-            r['tp'],
-            r['ep'],
-            r['dp_attention'],
-            r['conc'],
-            f"{r['median_ttft'] * 1000:.4f}",
-            f"{r['median_tpot'] * 1000:.4f}",
-            f"{r['median_intvty']:.4f}",
-            f"{r['median_e2el']:.4f}",
-            f"{r['tput_per_gpu']:.4f}",
-            f"{r['output_tput_per_gpu']:.4f}",
-            f"{r['input_tput_per_gpu']:.4f}",
+# Eval constants
+TASK = "Task"
+EM_STRICT = "EM Strict"
+EM_FLEXIBLE = "EM Flexible"
+N_EFF = "N (eff)"
+
+
+def load_json(path: Path) -> Optional[Dict[str, Any]]:
+    """Load JSON file and return dict, or None on error."""
+    try:
+        with open(path, 'r') as f:
+            return json.load(f)
+    except Exception:
+        return None
+
+
+def main():
+    if len(sys.argv) < 2:
+        print("Usage: python summarize.py <results_dir>")
+        sys.exit(1)
+
+    results = []
+    results_dir = Path(sys.argv[1])
+    for result_path in results_dir.rglob('*.json'):
+        result = load_json(result_path)
+        if result and 'is_multinode' in result:
+            results.append(result)
+
+    single_node_results = [r for r in results if not r['is_multinode']]
+    multinode_results = [r for r in results if r['is_multinode']]
+
+    # Single-node and multi-node results have different fields and therefore need to be printed separately
+    if single_node_results:
+        single_node_results.sort(key=lambda r: (
+            r['infmax_model_prefix'], r['hw'], r['framework'], r['precision'], r['isl'], r['osl'], r['tp'], r['ep'], r['conc']))
+
+        single_node_headers = [
+            MODEL, SERVED_MODEL, HARDWARE, FRAMEWORK, PRECISION, ISL, OSL, TP, EP, DP_ATTENTION,
+            CONC, TTFT, TPOT, INTERACTIVITY, E2EL, TPUT_PER_GPU, OUTPUT_TPUT_PER_GPU, INPUT_TPUT_PER_GPU
         ]
-        for r in single_node_results
-    ]
-
-    print("## Single-Node Results\n")
-    print(tabulate(single_node_rows, headers=single_node_headers, tablefmt="github"))
-    print("\n")
-
-if multinode_results:
-    multinode_results.sort(key=lambda r: (r['infmax_model_prefix'], r['hw'], r['framework'], r['precision'], r['isl'],
-                           r['osl'], r['prefill_tp'], r['prefill_ep'], r['decode_tp'], r['decode_ep'], r['conc']))
-
-    multinode_headers = [
-        MODEL, SERVED_MODEL, HARDWARE, FRAMEWORK, PRECISION, ISL, OSL,
-        PREFILL_TP, PREFILL_EP, PREFILL_DP_ATTN, PREFILL_WORKERS, PREFILL_GPUS,
-        DECODE_TP, DECODE_EP, DECODE_DP_ATTN, DECODE_WORKERS, DECODE_GPUS,
-        CONC, TTFT, TPOT, INTERACTIVITY, E2EL, TPUT_PER_GPU, OUTPUT_TPUT_PER_GPU, INPUT_TPUT_PER_GPU
-    ]
-
-    multinode_rows = [
-        [
-            r['infmax_model_prefix'],
-            r['model'],
-            r['hw'].upper(),
-            r['framework'].upper(),
-            r['precision'].upper(),
-            r['isl'],
-            r['osl'],
-            r['prefill_tp'],
-            r['prefill_ep'],
-            r['prefill_dp_attention'],
-            r['prefill_num_workers'],
-            r['num_prefill_gpu'],
-            r['decode_tp'],
-            r['decode_ep'],
-            r['decode_dp_attention'],
-            r['decode_num_workers'],
-            r['num_decode_gpu'],
-            r['conc'],
-            f"{r['median_ttft'] * 
1000:.4f}", - f"{r['median_tpot'] * 1000:.4f}", - f"{r['median_intvty']:.4f}", - f"{r['median_e2el']:.4f}", - f"{r['tput_per_gpu']:.4f}", - f"{r['output_tput_per_gpu']:.4f}", - f"{r['input_tput_per_gpu']:.4f}", + + single_node_rows = [ + [ + r['infmax_model_prefix'], + r['model'], + r['hw'].upper(), + r['framework'].upper(), + r['precision'].upper(), + r['isl'], + r['osl'], + r['tp'], + r['ep'], + r['dp_attention'], + r['conc'], + f"{r['median_ttft'] * 1000:.4f}", + f"{r['median_tpot'] * 1000:.4f}", + f"{r['median_intvty']:.4f}", + f"{r['median_e2el']:.4f}", + f"{r['tput_per_gpu']:.4f}", + f"{r['output_tput_per_gpu']:.4f}", + f"{r['input_tput_per_gpu']:.4f}", + ] + for r in single_node_results ] - for r in multinode_results - ] - print("## Multi-Node Results\n") - print(tabulate(multinode_rows, headers=multinode_headers, tablefmt="github")) + print("## Single-Node Results\n") + print(tabulate(single_node_rows, headers=single_node_headers, tablefmt="github")) + print("\n") + + if multinode_results: + multinode_results.sort(key=lambda r: (r['infmax_model_prefix'], r['hw'], r['framework'], r['precision'], r['isl'], + r['osl'], r['prefill_tp'], r['prefill_ep'], r['decode_tp'], r['decode_ep'], r['conc'])) + + multinode_headers = [ + MODEL, SERVED_MODEL, HARDWARE, FRAMEWORK, PRECISION, ISL, OSL, + PREFILL_TP, PREFILL_EP, PREFILL_DP_ATTN, PREFILL_WORKERS, PREFILL_GPUS, + DECODE_TP, DECODE_EP, DECODE_DP_ATTN, DECODE_WORKERS, DECODE_GPUS, + CONC, TTFT, TPOT, INTERACTIVITY, E2EL, TPUT_PER_GPU, OUTPUT_TPUT_PER_GPU, INPUT_TPUT_PER_GPU + ] + + multinode_rows = [ + [ + r['infmax_model_prefix'], + r['model'], + r['hw'].upper(), + r['framework'].upper(), + r['precision'].upper(), + r['isl'], + r['osl'], + r['prefill_tp'], + r['prefill_ep'], + r['prefill_dp_attention'], + r['prefill_num_workers'], + r['num_prefill_gpu'], + r['decode_tp'], + r['decode_ep'], + r['decode_dp_attention'], + r['decode_num_workers'], + r['num_decode_gpu'], + r['conc'], + f"{r['median_ttft'] * 1000:.4f}", + f"{r['median_tpot'] * 1000:.4f}", + f"{r['median_intvty']:.4f}", + f"{r['median_e2el']:.4f}", + f"{r['tput_per_gpu']:.4f}", + f"{r['output_tput_per_gpu']:.4f}", + f"{r['input_tput_per_gpu']:.4f}", + ] + for r in multinode_results + ] + + print("## Multi-Node Results\n") + print(tabulate(multinode_rows, headers=multinode_headers, tablefmt="github")) + + +if __name__ == "__main__": + main() \ No newline at end of file From 22c8a2bd9202d5ee69f4679b1eda40ac7d9937d3 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 16 Dec 2025 21:43:56 -0800 Subject: [PATCH 188/214] Evals summary refactor 2 --- .github/workflows/collect-evals.yml | 1 + .github/workflows/collect-results.yml | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/collect-evals.yml b/.github/workflows/collect-evals.yml index 64bf603e8..4d6288be6 100644 --- a/.github/workflows/collect-evals.yml +++ b/.github/workflows/collect-evals.yml @@ -29,6 +29,7 @@ jobs: - name: Summarize evals run: | + pip install tabulate echo "## Eval Summary - ${{ inputs.result-prefix || 'all' }}" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY python3 utils/collect_eval_results.py eval_results/ ${{ inputs.result-prefix || 'all' }} >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/collect-results.yml b/.github/workflows/collect-results.yml index ccc2ce4e4..5bfbde52e 100644 --- a/.github/workflows/collect-results.yml +++ b/.github/workflows/collect-results.yml @@ -34,7 +34,9 @@ jobs: python3 utils/summarize.py results/ >> $GITHUB_STEP_SUMMARY - 
name: Aggregate results - run: python3 utils/collect_results.py results/ ${{ inputs.result-prefix || 'all' }} + run: | + pip install tabulate + python3 utils/collect_results.py results/ ${{ inputs.result-prefix || 'all' }} - name: Upload aggregated results uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 From 8d12b35afc0b733c27fe3a0988ecba65bea4ed71 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 16 Dec 2025 22:16:40 -0800 Subject: [PATCH 189/214] Evals summary aesthetics --- .github/workflows/collect-evals.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/collect-evals.yml b/.github/workflows/collect-evals.yml index 4d6288be6..606117e79 100644 --- a/.github/workflows/collect-evals.yml +++ b/.github/workflows/collect-evals.yml @@ -30,7 +30,7 @@ jobs: - name: Summarize evals run: | pip install tabulate - echo "## Eval Summary - ${{ inputs.result-prefix || 'all' }}" >> $GITHUB_STEP_SUMMARY + echo "## Eval Summary" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY python3 utils/collect_eval_results.py eval_results/ ${{ inputs.result-prefix || 'all' }} >> $GITHUB_STEP_SUMMARY From d7a515a55c999603352013b28026272dec34979d Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 17 Dec 2025 18:18:35 -0800 Subject: [PATCH 190/214] TRT package fix, trt testing --- benchmarks/dsr1_fp8_h200_trt_slurm.sh | 3 ++- benchmarks/gptoss_fp4_h200_trt_slurm.sh | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index e71cc8b0d..3a0498d6a 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -54,7 +54,8 @@ fi set -x -MAX_NUM_TOKENS=$(( ($CONC+$ISL+64+63)/64*64 )) +MAX_NUM_TOKENS=$(( (CONC + ISL + 64 + 63) / 64 * 64 )) +MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 4096 ? MAX_MODEL_LEN : 4096 )) # Launch TRT-LLM server PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \ diff --git a/benchmarks/gptoss_fp4_h200_trt_slurm.sh b/benchmarks/gptoss_fp4_h200_trt_slurm.sh index 26043d322..964fd0352 100644 --- a/benchmarks/gptoss_fp4_h200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_trt_slurm.sh @@ -40,7 +40,7 @@ print_iter_log: true stream_interval: 20 EOF -mpirun -n 1 --oversubscribe --allow-run-as-root \ +PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \ trtllm-serve $MODEL \ --max_batch_size $CONC \ --max_num_tokens 20000 \ From 25f71bd1997ebbbb5de418017f73942161690959 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 17 Dec 2025 18:42:58 -0800 Subject: [PATCH 191/214] trt testing 2 --- benchmarks/dsr1_fp4_b200_trt_slurm.sh | 1 + benchmarks/dsr1_fp8_b200_trt_slurm.sh | 1 + 2 files changed, 2 insertions(+) diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh index f4165b72a..e72d4dcd1 100644 --- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh @@ -85,6 +85,7 @@ fi set -x MAX_NUM_TOKENS=$(( ($CONC+$ISL+64+63)/64*64 )) +MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 4096 ? MAX_MODEL_LEN : 4096 )) # Launch TRT-LLM server mpirun -n 1 --oversubscribe --allow-run-as-root \ diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh index c77f5277f..f9ab48a10 100644 --- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh @@ -55,6 +55,7 @@ fi set -x MAX_NUM_TOKENS=$(( ($CONC+$ISL+64+63)/64*64 )) +MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 4096 ? 
MAX_MODEL_LEN : 4096 )) # Launch TRT-LLM server mpirun -n 1 --oversubscribe --allow-run-as-root \ From ab6bf8f18de9063f58b113931ee4bec227e443f2 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 18 Dec 2025 21:40:32 -0800 Subject: [PATCH 192/214] max_num_tokens --- benchmarks/dsr1_fp4_b200_trt_slurm.sh | 1 + benchmarks/dsr1_fp8_b200_trt_slurm.sh | 1 + benchmarks/dsr1_fp8_h200_trt_slurm.sh | 1 + 3 files changed, 3 insertions(+) diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh index e72d4dcd1..a0902ad46 100644 --- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh @@ -86,6 +86,7 @@ set -x MAX_NUM_TOKENS=$(( ($CONC+$ISL+64+63)/64*64 )) MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 4096 ? MAX_MODEL_LEN : 4096 )) +MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 4096 ? MAX_NUM_TOKENS : 4096 )) # Launch TRT-LLM server mpirun -n 1 --oversubscribe --allow-run-as-root \ diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh index f9ab48a10..83d3c74a6 100644 --- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh @@ -56,6 +56,7 @@ set -x MAX_NUM_TOKENS=$(( ($CONC+$ISL+64+63)/64*64 )) MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 4096 ? MAX_MODEL_LEN : 4096 )) +MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 4096 ? MAX_NUM_TOKENS : 4096 )) # Launch TRT-LLM server mpirun -n 1 --oversubscribe --allow-run-as-root \ diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index 3a0498d6a..a8ee33776 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -56,6 +56,7 @@ set -x MAX_NUM_TOKENS=$(( (CONC + ISL + 64 + 63) / 64 * 64 )) MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 4096 ? MAX_MODEL_LEN : 4096 )) +MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 4096 ? 
MAX_NUM_TOKENS : 4096 )) # Launch TRT-LLM server PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \ From 9a873c4b67493569f4a56791affc392aafeaa494 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 7 Jan 2026 20:11:54 -0800 Subject: [PATCH 193/214] unbounded gen len --- benchmarks/benchmark_lib.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 1e25a8421..8ad96299b 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -310,7 +310,7 @@ run_lm_eval() { --tasks "utils/evals/${task}.yaml" \ --num_fewshot "${num_fewshot}" \ --output_path "${results_dir}" \ - --model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=2,num_concurrent=${concurrent_requests},tokenized_requests=False" \ + --model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=2,num_concurrent=${concurrent_requests},tokenized_requests=False,max_length=32768" \ --gen_kwargs "max_tokens=${gen_max_tokens},temperature=${temperature},top_p=${top_p}" set +x } From 999b9f67d8c020ef5f53a8d006f82fcd7f5d59ce Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 7 Jan 2026 20:45:51 -0800 Subject: [PATCH 194/214] Fix tmpl args, add isl/osl to table --- .github/workflows/benchmark-tmpl.yml | 1 + utils/collect_eval_results.py | 12 ++++++++---- utils/matrix_logic/generate_sweep_configs.py | 14 ++++++++++---- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 976e1d08d..73feb7ee2 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -53,6 +53,7 @@ on: run-eval: type: boolean required: true + default: false random-range-ratio: required: false type: string diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py index bb089d519..3116406e8 100644 --- a/utils/collect_eval_results.py +++ b/utils/collect_eval_results.py @@ -8,7 +8,7 @@ # Import shared utilities from summarize sys.path.insert(0, str(Path(__file__).resolve().parent)) from summarize import ( - load_json, MODEL, HARDWARE, FRAMEWORK, PRECISION, + load_json, MODEL, HARDWARE, FRAMEWORK, PRECISION, ISL, OSL, TP, EP, CONC, DP_ATTENTION, TASK, EM_STRICT, EM_FLEXIBLE, N_EFF ) @@ -207,6 +207,8 @@ def main(): 'hw': meta.get('hw', 'unknown').upper(), 'framework': meta.get('framework', 'unknown').lower(), 'precision': meta.get('precision', 'unknown').lower(), + 'isl': int(meta.get('isl', 0)), + 'osl': int(meta.get('osl', 0)), 'tp': int(meta.get('tp', 1)), 'ep': int(meta.get('ep', 1)), 'conc': int(meta.get('conc', 0)), @@ -223,7 +225,7 @@ def main(): # Sort for stable output rows.sort(key=lambda r: ( - r['hw'], r['framework'], r['precision'], r['tp'], r['conc'] + r['hw'], r['framework'], r['precision'], r['isl'], r['osl'], r['tp'], r['ep'], r['conc'] )) if not rows: @@ -231,7 +233,7 @@ def main(): else: # Print table using tabulate headers = [ - MODEL, HARDWARE, FRAMEWORK, PRECISION, TP, EP, CONC, DP_ATTENTION, + MODEL, HARDWARE, FRAMEWORK, PRECISION, ISL, OSL, TP, EP, CONC, DP_ATTENTION, TASK, EM_STRICT, EM_FLEXIBLE, N_EFF ] @@ -241,6 +243,8 @@ def main(): r['hw'], r['framework'].upper(), r['precision'].upper(), + r['isl'], + r['osl'], r['tp'], r['ep'], r['conc'], @@ -262,4 +266,4 @@ def main(): if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git 
a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py index db6826079..da81685a7 100644 --- a/utils/matrix_logic/generate_sweep_configs.py +++ b/utils/matrix_logic/generate_sweep_configs.py @@ -41,15 +41,21 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: from collections import defaultdict # Group entries by (model, runner, framework, precision, isl, osl) - # This ensures we compare within the same configuration, not across different frameworks + # Only include entries that have a top-level TP (i.e., single-node schema). + # This avoids relying on structural hints like prefill/decode which may be + # reused by future single-node disaggregated modes. groups = defaultdict(list) for i, entry in enumerate(matrix_values): + # Skip entries without a top-level TP field + if Fields.TP.value not in entry: + continue + key = ( - entry[Fields.MODEL.value], - entry[Fields.RUNNER.value], + entry[Fields.MODEL.value], + entry[Fields.RUNNER.value], entry[Fields.FRAMEWORK.value], entry[Fields.PRECISION.value], - entry[Fields.ISL.value], + entry[Fields.ISL.value], entry[Fields.OSL.value] ) groups[key].append((i, entry)) From 9a132501360035dc54c8cd1dea148ba67979181d Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 7 Jan 2026 22:17:54 -0800 Subject: [PATCH 195/214] add isl/osl --- .github/workflows/benchmark-tmpl.yml | 1 - benchmarks/benchmark_lib.sh | 4 +++- benchmarks/dsr1_fp4_b200_slurm.sh | 6 ++++++ benchmarks/dsr1_fp8_b200_slurm.sh | 9 ++++++++- benchmarks/gptoss_fp4_b200_slurm.sh | 7 +++++++ benchmarks/gptoss_fp4_b200_trt_docker.sh | 9 ++++++++- 6 files changed, 32 insertions(+), 4 deletions(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 73feb7ee2..976e1d08d 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -53,7 +53,6 @@ on: run-eval: type: boolean required: true - default: false random-range-ratio: required: false type: string diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 8ad96299b..1c1fc5398 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -357,7 +357,9 @@ append_lm_eval_summary() { "ep": ${EP_SIZE:-1}, "dp_attention": ${dp_json}, "model": "${model_name:-}", - "hw": "${RUNNER_TYPE:-unknown}" + "hw": "${RUNNER_TYPE:-unknown}", + "isl": "${ISL:-0}", + "osl": "${OSL:-0}" } META diff --git a/benchmarks/dsr1_fp4_b200_slurm.sh b/benchmarks/dsr1_fp4_b200_slurm.sh index 570d39792..730404602 100644 --- a/benchmarks/dsr1_fp4_b200_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_slurm.sh @@ -57,3 +57,9 @@ run_benchmark_serving \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 )) + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/dsr1_fp8_b200_slurm.sh b/benchmarks/dsr1_fp8_b200_slurm.sh index 71532f816..e6d107661 100644 --- a/benchmarks/dsr1_fp8_b200_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_slurm.sh @@ -57,4 +57,11 @@ run_benchmark_serving \ --num-prompts $(( $CONC * 10 )) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ No newline at end of file + --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 )) 
+ append_lm_eval_summary +fi +set +x diff --git a/benchmarks/gptoss_fp4_b200_slurm.sh b/benchmarks/gptoss_fp4_b200_slurm.sh index 6890a2191..c790a9ca8 100644 --- a/benchmarks/gptoss_fp4_b200_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_slurm.sh @@ -68,3 +68,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 )) + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/gptoss_fp4_b200_trt_docker.sh b/benchmarks/gptoss_fp4_b200_trt_docker.sh index 1f5fbe868..61ffe6318 100644 --- a/benchmarks/gptoss_fp4_b200_trt_docker.sh +++ b/benchmarks/gptoss_fp4_b200_trt_docker.sh @@ -87,4 +87,11 @@ run_benchmark_serving \ --num-prompts "$NUM_PROMPTS" \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ No newline at end of file + --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 )) + append_lm_eval_summary +fi +set +x From 4b0f8ded247ddebba343ca5367051fcaba1e3817 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 12 Jan 2026 13:21:16 -0800 Subject: [PATCH 196/214] set max tokens --- benchmarks/benchmark_lib.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 1c1fc5398..ffc92000a 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -275,7 +275,7 @@ run_lm_eval() { local task="${EVAL_TASK:-gsm8k}" local num_fewshot="${NUM_FEWSHOT:-2}" local results_dir="${EVAL_RESULT_DIR:-$(mktemp -d /tmp/eval_out-XXXXXX)}" - local gen_max_tokens=4096 + local gen_max_tokens=16384 local temperature=0 local top_p=1 local concurrent_requests=32 @@ -310,7 +310,7 @@ run_lm_eval() { --tasks "utils/evals/${task}.yaml" \ --num_fewshot "${num_fewshot}" \ --output_path "${results_dir}" \ - --model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=2,num_concurrent=${concurrent_requests},tokenized_requests=False,max_length=32768" \ + --model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=2,num_concurrent=${concurrent_requests},tokenized_requests=False,max_length=${gen_max_tokens}" \ --gen_kwargs "max_tokens=${gen_max_tokens},temperature=${temperature},top_p=${top_p}" set +x } From a52f4c6b2912b032d3adbba9025720efb5e3f037 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 12 Jan 2026 13:30:37 -0800 Subject: [PATCH 197/214] remove nvd --- .github/configs/runners.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml index e414af03e..458d2a7dc 100644 --- a/.github/configs/runners.yaml +++ b/.github/configs/runners.yaml @@ -21,10 +21,6 @@ b200-trt: - 'b200-nb_1' b200: # Docker-only nodes -- 'b200-nvd_0' -- 'b200-nvd_1' -- 'b200-nvd_2' -- 'b200-nvd_3' - 'b200-dgxc_1' - 'b200-dgxc_2' # Slurm nodes (also have b200 label, can run docker workloads) From 568e1d3d408fb13c131ae0e09890e6679e2d8968 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 12 Jan 2026 16:25:33 -0800 Subject: [PATCH 198/214] In case of multiple evals --- .github/workflows/benchmark-tmpl.yml | 1 + benchmarks/benchmark_lib.sh | 2 +- 
utils/collect_eval_results.py | 270 +++++++++++++++------------ utils/summarize.py | 1 + 4 files changed, 158 insertions(+), 116 deletions(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 976e1d08d..73feb7ee2 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -53,6 +53,7 @@ on: run-eval: type: boolean required: true + default: false random-range-ratio: required: false type: string diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index ffc92000a..27a3aea60 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -632,7 +632,7 @@ run_lighteval_eval() { _install_lighteval_deps _patch_lighteval_litellm - # Prefer OPENAI_MODEL_NAME, then EVAL_MODEL_NAME, then MODEL + # For lighteval, MODEL_NAME MUST BE SET local model_name="${MODEL_NAME}" if [[ -z "$model_name" ]]; then echo "Error: MODEL not set for lighteval." >&2 diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py index 3116406e8..11752bef3 100644 --- a/utils/collect_eval_results.py +++ b/utils/collect_eval_results.py @@ -9,7 +9,7 @@ sys.path.insert(0, str(Path(__file__).resolve().parent)) from summarize import ( load_json, MODEL, HARDWARE, FRAMEWORK, PRECISION, ISL, OSL, - TP, EP, CONC, DP_ATTENTION, TASK, EM_STRICT, EM_FLEXIBLE, N_EFF + TP, EP, CONC, DP_ATTENTION, TASK, SCORE, EM_STRICT, EM_FLEXIBLE, N_EFF ) @@ -57,11 +57,13 @@ def detect_eval_jsons(d: Path) -> Tuple[Optional[Path], Optional[Path]]: return lm_path, le_path -def extract_lm_metrics(json_path: Path) -> Dict[str, Any]: +def extract_lm_metrics(json_path: Path) -> List[Dict[str, Any]]: """Extract metrics from lm-eval harness result JSON. - + + Returns a list of metric dicts, one per task in the results. + Uses explicit structure from the JSON file: - - Task name from results keys + - Task names from results keys - Metric name from configs.metric_list - Filter names from configs.filter_list - Values from results[task][metric,filter] @@ -69,96 +71,111 @@ def extract_lm_metrics(json_path: Path) -> Dict[str, Any]: data = load_json(json_path) or {} results = data.get('results', {}) configs = data.get('configs', {}) - + if not results: - return {} - - # 1. Task: first key from results - task = next(iter(results.keys())) - - # 2. Base metric: from config's metric_list - metric_list = configs.get(task, {}).get('metric_list', []) - base_metric = metric_list[0]['metric'] if metric_list else 'exact_match' - - # 3. 
Filters: from config's filter_list - filter_list = configs.get(task, {}).get('filter_list', []) - - strict_val, strict_se = None, None - flex_val, flex_se = None, None - - # Helper to get value/stderr pair for filtered metrics - def get_val_se(filter_name: str) -> Tuple[Optional[float], Optional[float]]: - val_key = f"{base_metric},{filter_name}" - se_key = f"{base_metric}_stderr,{filter_name}" - return results[task].get(val_key), results[task].get(se_key) - - # Extract metrics based on filter_list - if not filter_list: - # No filters - use base metric for strict - strict_val = results[task].get(base_metric) - strict_se = results[task].get(f"{base_metric}_stderr") - else: - # Extract metrics for each filter - for f in filter_list: - fname = f['name'] - if 'strict' in fname: - strict_val, strict_se = get_val_se(fname) - elif 'flex' in fname or 'extract' in fname: - flex_val, flex_se = get_val_se(fname) - - # N-samples (effective count) - n_eff = data.get('n-samples', {}).get(task, {}).get('effective') - - # Model name - model = ( - data.get('model_name') - or configs.get(task, {}).get('metadata', {}).get('model') - ) - - return { - 'task': task, - 'strict': strict_val, - 'strict_se': strict_se, - 'flex': flex_val, - 'flex_se': flex_se, - 'n_eff': n_eff, - 'model': model, - 'source': str(json_path) - } + return [] + + extracted = [] + + for task in results.keys(): + task_results = results[task] + task_config = configs.get(task, {}) + + # Base metric: from config's metric_list + metric_list = task_config.get('metric_list', []) + base_metric = metric_list[0]['metric'] if metric_list else 'exact_match' + + # Filters: from config's filter_list + filter_list = task_config.get('filter_list', []) + + strict_val, strict_se = None, None + flex_val, flex_se = None, None + accuracy_val, accuracy_se = None, None + + # Helper to get value/stderr pair for filtered metrics + def get_val_se(filter_name: str) -> Tuple[Optional[float], Optional[float]]: + val_key = f"{base_metric},{filter_name}" + se_key = f"{base_metric}_stderr,{filter_name}" + return task_results.get(val_key), task_results.get(se_key) + + # Extract metrics based on filter_list + if not filter_list: + # No filters - check for accuracy or use base metric + if 'acc' in task_results: + accuracy_val = task_results.get('acc') + accuracy_se = task_results.get('acc_stderr') + else: + strict_val = task_results.get(base_metric) + strict_se = task_results.get(f"{base_metric}_stderr") + else: + # Extract metrics for each filter + for f in filter_list: + fname = f['name'] + if 'strict' in fname: + strict_val, strict_se = get_val_se(fname) + elif 'flex' in fname or 'extract' in fname: + flex_val, flex_se = get_val_se(fname) + + # N-samples (effective count) + n_eff = data.get('n-samples', {}).get(task, {}).get('effective') + # Model name + model = ( + data.get('model_name') + or task_config.get('metadata', {}).get('model') + ) -def extract_lighteval_metrics(json_path: Path, task_base: Optional[str] = None) -> Dict[str, Any]: - """Extract metrics from lighteval result JSON.""" + extracted.append({ + 'task': task, + 'strict': strict_val, + 'strict_se': strict_se, + 'flex': flex_val, + 'flex_se': flex_se, + 'accuracy': accuracy_val, + 'accuracy_se': accuracy_se, + 'n_eff': n_eff, + 'model': model, + 'source': str(json_path) + }) + + return extracted + + +def extract_lighteval_metrics(json_path: Path) -> List[Dict[str, Any]]: + """Extract metrics from lighteval result JSON. + + Returns a list of metric dicts, one per task in the results. 
+ """ data = load_json(json_path) or {} results = data.get('results', {}) or {} - - # Find task key - key = None - if task_base: - for k in results.keys(): - if str(k).startswith(task_base): - key = k - break - if key is None: - key = next(iter(results.keys())) if results else 'unknown' - - r = results.get(key, {}) - em = r.get('extractive_match') - em_se = r.get('extractive_match_stderr') + + if not results: + return [] cg = data.get('config_general', {}) or {} model = cg.get('model_name') or cg.get('model_config', {}).get('model_name', '') - return { - 'task': key, - 'strict': em, - 'flex': None, - 'strict_se': em_se, - 'flex_se': None, - 'n_eff': None, - 'model': model, - 'source': str(json_path) - } + extracted = [] + + for task in results.keys(): + r = results.get(task, {}) + em = r.get('extractive_match') + em_se = r.get('extractive_match_stderr') + + extracted.append({ + 'task': task, + 'strict': em, + 'strict_se': em_se, + 'flex': None, + 'flex_se': None, + 'accuracy': None, + 'accuracy_se': None, + 'n_eff': None, + 'model': model, + 'source': str(json_path) + }) + + return extracted def pct(x: Any) -> str: @@ -177,6 +194,45 @@ def se(x: Any) -> str: return '' +def build_row(meta: Dict[str, Any], m: Dict[str, Any]) -> Dict[str, Any]: + """Build a result row from metadata and extracted metrics.""" + row = { + 'model': m.get('model') or meta.get('model', 'unknown'), + 'hw': meta.get('hw', 'unknown').upper(), + 'framework': meta.get('framework', 'unknown').lower(), + 'precision': meta.get('precision', 'unknown').lower(), + 'isl': int(meta.get('isl', 0)), + 'osl': int(meta.get('osl', 0)), + 'tp': int(meta.get('tp', 1)), + 'ep': int(meta.get('ep', 1)), + 'conc': int(meta.get('conc', 0)), + 'dp_attention': str(meta.get('dp_attention', False)).lower(), + 'task': m.get('task', 'unknown'), + 'em_strict': m.get('strict'), + 'em_strict_se': m.get('strict_se'), + 'em_flexible': m.get('flex'), + 'em_flexible_se': m.get('flex_se'), + 'n_eff': m.get('n_eff'), + 'source': m.get('source'), + } + + # Add universal score field (primary metric for unified comparison) + if m.get('strict') is not None: + row['score'] = m.get('strict') + row['score_name'] = 'em_strict' + row['score_se'] = m.get('strict_se') + elif m.get('accuracy') is not None: + row['score'] = m.get('accuracy') + row['score_name'] = 'accuracy' + row['score_se'] = m.get('accuracy_se') + else: + row['score'] = None + row['score_name'] = None + row['score_se'] = None + + return row + + def main(): if len(sys.argv) < 3: print('Usage: collect_eval_results.py ') @@ -189,39 +245,22 @@ def main(): for d in find_eval_sets(root): meta = load_json(d / 'meta_env.json') or {} lm_path, le_path = detect_eval_jsons(d) - - # Extract metrics (prefer lm-eval) + + # Extract metrics (prefer lm-eval) - returns list for multi-task support if lm_path: - m = extract_lm_metrics(lm_path) + metrics_list = extract_lm_metrics(lm_path) elif le_path: - m = extract_lighteval_metrics(le_path) + metrics_list = extract_lighteval_metrics(le_path) else: continue - if not m: + if not metrics_list: continue - # Build row from meta + metrics - row = { - 'model': m.get('model') or meta.get('model', 'unknown'), - 'hw': meta.get('hw', 'unknown').upper(), - 'framework': meta.get('framework', 'unknown').lower(), - 'precision': meta.get('precision', 'unknown').lower(), - 'isl': int(meta.get('isl', 0)), - 'osl': int(meta.get('osl', 0)), - 'tp': int(meta.get('tp', 1)), - 'ep': int(meta.get('ep', 1)), - 'conc': int(meta.get('conc', 0)), - 'dp_attention': str(meta.get('dp_attention', 
False)).lower(), - 'task': m.get('task', 'unknown'), - 'em_strict': m.get('strict'), - 'em_strict_se': m.get('strict_se'), - 'em_flexible': m.get('flex'), - 'em_flexible_se': m.get('flex_se'), - 'n_eff': m.get('n_eff'), - 'source': m.get('source'), - } - rows.append(row) + # Build row for each task in the results + for m in metrics_list: + row = build_row(meta, m) + rows.append(row) # Sort for stable output rows.sort(key=lambda r: ( @@ -233,10 +272,10 @@ def main(): else: # Print table using tabulate headers = [ - MODEL, HARDWARE, FRAMEWORK, PRECISION, ISL, OSL, TP, EP, CONC, DP_ATTENTION, - TASK, EM_STRICT, EM_FLEXIBLE, N_EFF + MODEL, HARDWARE, FRAMEWORK, PRECISION, ISL, OSL, TP, EP, CONC, DP_ATTENTION, + TASK, SCORE, EM_STRICT, EM_FLEXIBLE, N_EFF ] - + table_rows = [ [ r['model'], @@ -250,13 +289,14 @@ def main(): r['conc'], r['dp_attention'], r['task'], + f"{pct(r['score'])}{se(r['score_se'])}", f"{pct(r['em_strict'])}{se(r['em_strict_se'])}", f"{pct(r['em_flexible'])}{se(r['em_flexible_se'])}", r['n_eff'] or '' ] for r in rows ] - + print(tabulate(table_rows, headers=headers, tablefmt="github")) # Write JSON aggregate diff --git a/utils/summarize.py b/utils/summarize.py index c40754ab7..5e248164f 100644 --- a/utils/summarize.py +++ b/utils/summarize.py @@ -36,6 +36,7 @@ # Eval constants TASK = "Task" +SCORE = "Score" EM_STRICT = "EM Strict" EM_FLEXIBLE = "EM Flexible" N_EFF = "N (eff)" From d55c79622c43151fe752fd277bc47fe982eee650 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 12 Jan 2026 21:06:05 -0800 Subject: [PATCH 199/214] diagnostic --- .github/workflows/benchmark-tmpl.yml | 1 + benchmarks/benchmark_lib.sh | 4 ++-- benchmarks/gptoss_fp4_b200_trt_docker.sh | 2 +- benchmarks/gptoss_fp4_b200_trt_slurm.sh | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 73feb7ee2..addd11521 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -195,6 +195,7 @@ jobs: path: | meta_env.json results*.json + sample*.jsonl if-no-files-found: ignore - name: Cleanup eval outputs (post-upload) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 27a3aea60..4d8148387 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -309,7 +309,7 @@ run_lm_eval() { python3 -m lm_eval --model local-chat-completions --apply_chat_template \ --tasks "utils/evals/${task}.yaml" \ --num_fewshot "${num_fewshot}" \ - --output_path "${results_dir}" \ + --output_path "${results_dir}" --log_samples \ --model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=2,num_concurrent=${concurrent_requests},tokenized_requests=False,max_length=${gen_max_tokens}" \ --gen_kwargs "max_tokens=${gen_max_tokens},temperature=${temperature},top_p=${top_p}" set +x @@ -373,7 +373,7 @@ META if [ "$base" != "meta_env.json" ]; then mv -f "$jf" ./ || true fi - done < <(find "${out_dir}" -type f -name "*.json" -print0 2>/dev/null) + done < <(find "${out_dir}" -type f -name "*.json*" -print0 2>/dev/null) fi # Best-effort cleanup of the temp directory diff --git a/benchmarks/gptoss_fp4_b200_trt_docker.sh b/benchmarks/gptoss_fp4_b200_trt_docker.sh index 61ffe6318..64c0556cd 100644 --- a/benchmarks/gptoss_fp4_b200_trt_docker.sh +++ b/benchmarks/gptoss_fp4_b200_trt_docker.sh @@ -91,7 +91,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - 
run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 )) + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC )) append_lm_eval_summary fi set +x diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh index 37b6edf63..b82f562de 100644 --- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh @@ -106,7 +106,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 )) + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC )) append_lm_eval_summary fi set +x From fcd14e228a1218c76a961b92220bbc1c7f82fbcb Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 13 Jan 2026 13:13:40 -0800 Subject: [PATCH 200/214] test dp_attn --- benchmarks/gptoss_fp4_b200_trt_slurm.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh index 954d7ca93..ce397265c 100644 --- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh @@ -29,6 +29,7 @@ PORT=$(( 8888 + $PORT_OFFSET )) # ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= MOE_BACKEND="TRTLLM" +export DP_ATTENTION=false echo "MOE_BACKEND set to '$MOE_BACKEND'" From c9025452b80a921479c7d9e546c12cdf14894934 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 14 Jan 2026 11:48:12 -0800 Subject: [PATCH 201/214] DP_ATTENTION back --- benchmarks/gptoss_fp4_b200_trt_slurm.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh index ce397265c..954d7ca93 100644 --- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh @@ -29,7 +29,6 @@ PORT=$(( 8888 + $PORT_OFFSET )) # ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= MOE_BACKEND="TRTLLM" -export DP_ATTENTION=false echo "MOE_BACKEND set to '$MOE_BACKEND'" From 715269c22031a2cea1621a26c38a63e24dbbc6b1 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 15 Jan 2026 09:26:21 -0800 Subject: [PATCH 202/214] REMOVE LIGHTEVAL --- benchmarks/benchmark_lib.sh | 290 ---------------------------------- utils/collect_eval_results.py | 45 +----- utils/evals/EVALS.md | 4 +- utils/evals/custom_gsm8k.py | 22 --- 4 files changed, 2 insertions(+), 359 deletions(-) delete mode 100644 utils/evals/custom_gsm8k.py diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 4d8148387..ec33311b1 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -384,295 +384,6 @@ META echo "Moved eval artifacts to: $(pwd)" } -# ------------------------------ -# Lighteval + LiteLLM patching -# ------------------------------ - -_install_lighteval_deps() { - python3 -m pip install -q --no-cache-dir "lighteval==0.13.0" "litellm==1.80.7" || true -} - -# Patch lighteval's LiteLLMClient to handle reasoning content and Python name mangling -# 1. Removed "response_format": {"type": "text"}, as it interferred with vllm endpoint -# 2. Concat reasoning with output tokens as sometimes the output is empty. 
-_patch_lighteval_litellm() { - local patch_dir - patch_dir="$(mktemp -d)" - cat > "$patch_dir/sitecustomize.py" <<'PY' -import logging -import os -import time -import re -from concurrent.futures import ThreadPoolExecutor, as_completed - -import litellm -from tqdm import tqdm - -litellm.suppress_debug_info = True -litellm.drop_params = True - -# Remove sglang import that crashes -try: - # This is where lighteval's is_package_available lives - from lighteval.utils import imports as le_imports -except Exception: - le_imports = None -else: - _orig_is_package_available = le_imports.is_package_available - - def _patched_is_package_available(pkg: str) -> bool: - # Force "sglang" to look unavailable so that - # lighteval.models.sglang.sglang_model never imports `sglang` - if pkg == "sglang": - return False - return _orig_is_package_available(pkg) - - le_imports.is_package_available = _patched_is_package_available - -from lighteval.models.endpoints.litellm_model import LiteLLMClient -from lighteval.data import GenerativeTaskDataset -from lighteval.tasks.requests import Doc -from lighteval.models.model_output import ModelResponse - -logger = logging.getLogger(__name__) - -def _patched___call_api(self, prompt, return_logits, max_new_tokens, num_samples, stop_sequence): # noqa: C901, N802 - from lighteval.models.endpoints.litellm_model import LitellmModelResponse - response = LitellmModelResponse() - # Keep dataset-provided stop sequences to cut early - max_new_tokens = self._prepare_max_new_tokens(max_new_tokens) - - if return_logits and not self.provider == "openai": - logger.warning("Returning logits is not supported for this provider, ignoring.") - - kwargs = { - "model": self.model, - "messages": prompt, - "max_tokens": max_new_tokens, - "logprobs": return_logits if self.provider == "openai" else None, - "stop": stop_sequence, - "base_url": self.base_url, - "api_key": self.api_key, - "n": num_samples, - "timeout": self.timeout, - } - - # vLLM/SGLang OpenAI servers: apply chat template and start assistant turn - if ( - self.provider == "openai" - and isinstance(self.base_url, str) - and self.base_url - and ("api.openai.com" not in self.base_url) - ): - kwargs["extra_body"] = {"use_chat_template": True, "add_generation_prompt": True} - - if "o1" in self.model: - logger.warning("O1 models do not support temperature, top_p, stop sequence. Disabling.") - else: - kwargs.update(self.generation_parameters.to_litellm_dict()) - - if kwargs.get("max_completion_tokens", None) is None: - kwargs["max_completion_tokens"] = max_new_tokens - - for attempt in range(self.API_MAX_RETRY): - try: - response = litellm.completion(**kwargs) - msg = response.choices[0].message - content = getattr(msg, "content", None) - reasoning = getattr(msg, "reasoning_content", None) - - # Accept reasoning-only replies - if (not content) and reasoning: - return response - - return response - except litellm.BadRequestError as e: - if "message" in e.__dict__ and "policy" in e.__dict__["message"]: - logger.warning("Content filtered. 
Returning empty response.") - return LitellmModelResponse() - except Exception as e: - wait_time = min(64, self.API_RETRY_SLEEP * (self.API_RETRY_MULTIPLIER**attempt)) - logger.warning(f"Error: {e}, waiting {wait_time}s before retry {attempt + 1}/{self.API_MAX_RETRY}") - time.sleep(wait_time) - - logger.error(f"API call failed after {self.API_MAX_RETRY} attempts.") - return LitellmModelResponse() - - -def _patched___call_api_parallel(self, prompts, return_logits, max_new_tokens, num_samples, stop_sequence): # noqa: N802 - # Build per-item args - return_logitss = [return_logits for _ in prompts] if not isinstance(return_logits, list) else return_logits - max_new_tokenss = [max_new_tokens for _ in prompts] if not isinstance(max_new_tokens, list) else max_new_tokens - num_sampless = [num_samples for _ in prompts] if not isinstance(num_samples, list) else num_samples - stop_sequencess = [stop_sequence for _ in prompts] - - n = len(prompts) - assert n == len(return_logitss) == len(max_new_tokenss) == len(num_sampless) == len(stop_sequencess), ( - f"Length mismatch: {len(prompts)}, {len(return_logitss)}, {len(max_new_tokenss)}, " - f"{len(num_sampless)}, {len(stop_sequencess)}" - ) - - results = [None] * n - with ThreadPoolExecutor(self.concurrent_requests) as executor: - futures = [] - for idx in range(n): - fut = executor.submit( - self._LiteLLMClient__call_api, - prompts[idx], - return_logitss[idx], - max_new_tokenss[idx], - num_sampless[idx], - stop_sequencess[idx], - ) - fut._le_idx = idx # attach index for order restoration - futures.append(fut) - - for fut in tqdm(as_completed(futures), total=n, disable=self.disable_tqdm): - idx = getattr(fut, "_le_idx", None) - try: - res = fut.result() - except Exception: - res = None - if idx is not None: - results[idx] = res - - if any(r is None for r in results): - raise ValueError("Some entries are not annotated due to errors in __call_api_parallel, please inspect and retry.") - - return results - - -def _greedy_until_impl(self, docs: list[Doc]) -> list[ModelResponse]: - dataset = GenerativeTaskDataset(requests=docs, num_dataset_splits=self.DATASET_SPLITS) - results: list[ModelResponse] = [] - - for split in tqdm( - dataset.splits_iterator(), - total=dataset.num_dataset_splits, - desc="Splits", - position=0, - disable=self.disable_tqdm, - ): - contexts = [self.prompt_manager.prepare_prompt_api(doc) for doc in split] - - max_new_tokens = split[0].generation_size - return_logits = split[0].use_logits - num_samples = split[0].num_samples - stop_sequence = split[0].stop_sequences - - if num_samples > 1 and self.generation_parameters.temperature == 0: - raise ValueError("num_samples > 1 requires temperature > 0") - - responses = self._LiteLLMClient__call_api_parallel( - contexts, - return_logits, - max_new_tokens, - num_samples, - stop_sequence, - ) - - for response, context in zip(responses, contexts): - merged_texts: list[str] = [] - reasonings: list[str | None] = [] - - for choice in response.choices: - msg = choice.message - raw_content = getattr(msg, "content", None) or "" - reasoning = getattr(msg, "reasoning_content", None) - - # For answer extraction, use only the content field - # The reasoning is stored separately for logging/debugging - merged_texts.append(raw_content.strip() if raw_content else "") - reasonings.append(reasoning if reasoning else None) - - if not merged_texts or merged_texts[0] is None: - merged_texts = [""] - - results.append( - ModelResponse( - text=merged_texts, - reasonings=reasonings, - input=context, - ) - ) - - if 
len(results) != len(dataset): - raise RuntimeError(f"Internal mismatch: {len(results)} outputs vs {len(dataset)} docs.") - - return dataset.get_original_order(results) - -# Bind patches -LiteLLMClient._LiteLLMClient__call_api = _patched___call_api -LiteLLMClient._LiteLLMClient__call_api_parallel = _patched___call_api_parallel -#LiteLLMClient.greedy_until = _greedy_until_impl -PY - export PYTHONPATH="${patch_dir}:${PYTHONPATH:-}" -} - -run_lighteval_eval() { - local port="${PORT:-8888}" - local task="${EVAL_TASK:-gsm8k}" - local num_fewshot="${NUM_FEWSHOT:-5}" - local results_dir="${EVAL_RESULT_DIR:-$(mktemp -d /tmp/eval_out-XXXXXX)}" - local max_samples=0 - local concurrent_requests=32 - - while [[ $# -gt 0 ]]; do - case $1 in - --port) port="$2"; shift 2 ;; - --task) task="$2"; shift 2 ;; - --num-fewshot) num_fewshot="$2"; shift 2 ;; - --results-dir) results_dir="$2"; shift 2 ;; - --max-samples) max_samples="$2"; shift 2 ;; - --concurrent-requests) concurrent_requests="$2"; shift 2 ;; - *) echo "Unknown parameter: $1"; return 1 ;; - esac - done - - _install_lighteval_deps - _patch_lighteval_litellm - - # For lighteval, MODEL_NAME MUST BE SET - local model_name="${MODEL_NAME}" - if [[ -z "$model_name" ]]; then - echo "Error: MODEL not set for lighteval." >&2 - return 1 - fi - - # LiteLLM provider prefix logic - local lite_model="$model_name" - if [[ "$lite_model" != openai/* ]]; then - lite_model="openai/${lite_model}" - fi - - local base_url="http://0.0.0.0:${port}/v1" - export OPENAI_API_KEY="${OPENAI_API_KEY:-EMPTY}" - - local MODEL_ARGS="model_name=${lite_model},base_url=${base_url},api_key=${OPENAI_API_KEY},generation_parameters={temperature:0.0,top_p:1,max_new_tokens:2048},concurrent_requests=${concurrent_requests}" - local TASK_SPEC="${task}|${num_fewshot}" - - # Respect absolute paths (e.g., /tmp/eval_out); otherwise write under /workspace - local output_dir - if [[ "$results_dir" = /* ]]; then - output_dir="$results_dir" - else - output_dir="/workspace/${results_dir}" - fi - - # Make output dir visible to append_lm_eval_summary - export EVAL_RESULT_DIR="$output_dir" - - set -x - lighteval endpoint litellm \ - "${MODEL_ARGS}" \ - "${TASK_SPEC}" \ - --output-dir "${output_dir}" \ - --custom-tasks utils/evals/custom_gsm8k.py \ - --max-samples "${max_samples}" - set +x -} - - # ------------------------------ # Unified eval entrypoint # ------------------------------ @@ -690,7 +401,6 @@ run_eval() { case "$framework" in lm-eval|lm_eval) run_lm_eval "${forwarded[@]}" ;; - lighteval) run_lighteval_eval "${forwarded[@]}" ;; *) echo "Unknown framework '${framework}'"; return 1 ;; esac } diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py index 11752bef3..5ffaa0cc9 100644 --- a/utils/collect_eval_results.py +++ b/utils/collect_eval_results.py @@ -29,7 +29,7 @@ def find_eval_sets(root: Path) -> List[Path]: def detect_eval_jsons(d: Path) -> Tuple[Optional[Path], Optional[Path]]: - """Return (lm_eval_json, lighteval_json) if present. + """Return (lm_eval_json) if present. Checks immediate directory for result JSONs. 
""" @@ -49,10 +49,6 @@ def detect_eval_jsons(d: Path) -> Tuple[Optional[Path], Optional[Path]]: # lm-eval harness - pick latest if multiple if lm_path is None or p.stat().st_mtime > lm_path.stat().st_mtime: lm_path = p - elif 'config_general' in data and 'results' in data: - # lighteval - pick latest if multiple - if le_path is None or p.stat().st_mtime > le_path.stat().st_mtime: - le_path = p return lm_path, le_path @@ -141,43 +137,6 @@ def get_val_se(filter_name: str) -> Tuple[Optional[float], Optional[float]]: return extracted -def extract_lighteval_metrics(json_path: Path) -> List[Dict[str, Any]]: - """Extract metrics from lighteval result JSON. - - Returns a list of metric dicts, one per task in the results. - """ - data = load_json(json_path) or {} - results = data.get('results', {}) or {} - - if not results: - return [] - - cg = data.get('config_general', {}) or {} - model = cg.get('model_name') or cg.get('model_config', {}).get('model_name', '') - - extracted = [] - - for task in results.keys(): - r = results.get(task, {}) - em = r.get('extractive_match') - em_se = r.get('extractive_match_stderr') - - extracted.append({ - 'task': task, - 'strict': em, - 'strict_se': em_se, - 'flex': None, - 'flex_se': None, - 'accuracy': None, - 'accuracy_se': None, - 'n_eff': None, - 'model': model, - 'source': str(json_path) - }) - - return extracted - - def pct(x: Any) -> str: """Format value as percentage.""" try: @@ -249,8 +208,6 @@ def main(): # Extract metrics (prefer lm-eval) - returns list for multi-task support if lm_path: metrics_list = extract_lm_metrics(lm_path) - elif le_path: - metrics_list = extract_lighteval_metrics(le_path) else: continue diff --git a/utils/evals/EVALS.md b/utils/evals/EVALS.md index 511c80804..fcdcd5360 100644 --- a/utils/evals/EVALS.md +++ b/utils/evals/EVALS.md @@ -15,14 +15,12 @@ To verify how model outputs are affected by throughput optimizations. - Check kernel implementations for correctness ## How? -- `run_eval`, definined in `benchmarks/benchmark_lib.sh`, is called in `benchmarks/*`. Either EleutherAI/lm-evaluation-harness(lmeval) or lighteval with litellm is ran, using the same endpoint as the throughput benchmark. JSON results are processed and converted to a table with `utils/collect_eval_results.py`. +- `run_eval`, definined in `benchmarks/benchmark_lib.sh`, is called in `benchmarks/*`. EleutherAI/lm-evaluation-harness(lmeval), using the same endpoint as the throughput benchmark. JSON results are processed and converted to a table with `utils/collect_eval_results.py`. ## Misc Following files are task definitions from lmeval, more info on changes within the files - `utils/evals/math500.yaml` - `utils/evals/gsm8k.yaml` -Following files are task definitions from lighteval, more info on changes within the files -- `utils/evals/custom_gsm8k.py` diff --git a/utils/evals/custom_gsm8k.py b/utils/evals/custom_gsm8k.py deleted file mode 100644 index ac6c0b9be..000000000 --- a/utils/evals/custom_gsm8k.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copied from https://github.com/huggingface/lighteval/blob/99ef5b98d422cf3620eebec9db13285493d35542/src/lighteval/tasks/tasks/gsm8k.py -# Increases generation size to 768 from 256 to better accommodate longer solutions by dsr1. 
-from lighteval.metrics.metrics import Metrics -from lighteval.tasks.lighteval_task import LightevalTaskConfig -from lighteval.tasks.tasks.gsm8k import gsm8k_prompt - -gsm8k_long = LightevalTaskConfig( - name="gsm8k_long", - prompt_function=gsm8k_prompt, - hf_repo="openai/gsm8k", - hf_subset="main", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select="random_sampling_from_train", - generation_size=1024, # raised this from 256 - metrics=[Metrics.expr_gold_metric], - stop_sequence=None, # avoid early stop on "Question:" - version=0, -) - -TASKS_TABLE = [gsm8k_long] From a353ea4ab54cd271aef019c0db49983b8eed457f Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 19 Jan 2026 22:37:17 -0800 Subject: [PATCH 203/214] Add evals for atom, trt_mtp --- benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh | 7 +++++++ benchmarks/dsr1_fp4_mi355x_atom_slurm.sh | 6 ++++++ benchmarks/dsr1_fp8_mi355x_atom_slurm.sh | 6 ++++++ benchmarks/gptoss_fp4_mi355x_atom_slurm.sh | 6 ++++++ 4 files changed, 25 insertions(+) diff --git a/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh index 33d819efa..104a33ca2 100644 --- a/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh @@ -102,3 +102,10 @@ run_benchmark_serving \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ \ --use-chat-template + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 )) + append_lm_eval_summary +fi +set +x \ No newline at end of file diff --git a/benchmarks/dsr1_fp4_mi355x_atom_slurm.sh b/benchmarks/dsr1_fp4_mi355x_atom_slurm.sh index 8028ae449..c50273d60 100644 --- a/benchmarks/dsr1_fp4_mi355x_atom_slurm.sh +++ b/benchmarks/dsr1_fp4_mi355x_atom_slurm.sh @@ -65,3 +65,9 @@ run_benchmark_serving \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 )) + append_lm_eval_summary +fi +set +x \ No newline at end of file diff --git a/benchmarks/dsr1_fp8_mi355x_atom_slurm.sh b/benchmarks/dsr1_fp8_mi355x_atom_slurm.sh index 8028ae449..c50273d60 100644 --- a/benchmarks/dsr1_fp8_mi355x_atom_slurm.sh +++ b/benchmarks/dsr1_fp8_mi355x_atom_slurm.sh @@ -65,3 +65,9 @@ run_benchmark_serving \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 )) + append_lm_eval_summary +fi +set +x \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_mi355x_atom_slurm.sh b/benchmarks/gptoss_fp4_mi355x_atom_slurm.sh index 953505ee9..560e29df6 100644 --- a/benchmarks/gptoss_fp4_mi355x_atom_slurm.sh +++ b/benchmarks/gptoss_fp4_mi355x_atom_slurm.sh @@ -66,3 +66,9 @@ run_benchmark_serving \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 )) + append_lm_eval_summary +fi +set +x \ No newline at end of file From d6d4055b85b82267c3b81b0f0c4dbc12f1a90e12 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 19 Jan 2026 22:59:47 -0800 Subject: 
[PATCH 204/214] remove tokenizer from benchmarkserving --- benchmarks/dsr1_fp8_h200_slurm.sh | 4 ++-- benchmarks/gptoss_fp4_b200_docker.sh | 1 - benchmarks/gptoss_fp4_h100_docker.sh | 1 - benchmarks/gptoss_fp4_h100_slurm.sh | 1 - benchmarks/gptoss_fp4_h200_slurm.sh | 1 - benchmarks/gptoss_fp4_mi300x_docker.sh | 1 - benchmarks/gptoss_fp4_mi325x_slurm.sh | 1 - benchmarks/gptoss_fp4_mi355x_docker.sh | 1 - 8 files changed, 2 insertions(+), 9 deletions(-) diff --git a/benchmarks/dsr1_fp8_h200_slurm.sh b/benchmarks/dsr1_fp8_h200_slurm.sh index 41d649a74..657504290 100644 --- a/benchmarks/dsr1_fp8_h200_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_slurm.sh @@ -24,7 +24,7 @@ export TORCH_CUDA_ARCH_LIST="9.0" set -x if [[ $ISL -eq 1024 && $OSL -eq 1024 ]]; then - PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --tokenizer-path $MODEL \ + PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL \ --host 0.0.0.0 --port $PORT --trust-remote-code \ --tensor-parallel-size=$TP --data-parallel-size=1 \ --disable-radix-cache --max-running-requests 512 --cuda-graph-max-bs 512 \ @@ -33,7 +33,7 @@ if [[ $ISL -eq 1024 && $OSL -eq 1024 ]]; then --decode-log-interval 1 \ > $SERVER_LOG 2>&1 & else - PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --tokenizer-path $MODEL \ + PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL \ --host 0.0.0.0 --port $PORT --trust-remote-code \ --tensor-parallel-size=$TP --data-parallel-size=1 \ --disable-radix-cache --max-running-requests 256 --cuda-graph-max-bs 256 \ diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh index 44b772a9d..8949fbc93 100644 --- a/benchmarks/gptoss_fp4_b200_docker.sh +++ b/benchmarks/gptoss_fp4_b200_docker.sh @@ -66,7 +66,6 @@ pip install -q datasets pandas run_benchmark_serving \ --model "$MODEL_NAME" \ - --tokenizer "$MODEL" \ --port "$PORT" \ --backend vllm \ --input-len "$ISL" \ diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 8851b0a0c..dead5fbc7 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -46,7 +46,6 @@ pip install -q datasets pandas run_benchmark_serving \ --model "$MODEL_NAME" \ - --tokenizer "$MODEL" \ --port "$PORT" \ --backend vllm \ --input-len "$ISL" \ diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh index 99e939c69..ac9d29b2e 100644 --- a/benchmarks/gptoss_fp4_h100_slurm.sh +++ b/benchmarks/gptoss_fp4_h100_slurm.sh @@ -47,7 +47,6 @@ pip install -q datasets pandas run_benchmark_serving \ --model "$MODEL_NAME" \ - --tokenizer "$MODEL" \ --port "$PORT" \ --backend vllm \ --input-len "$ISL" \ diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh index b28a00c3f..31689bd4a 100644 --- a/benchmarks/gptoss_fp4_h200_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_slurm.sh @@ -57,7 +57,6 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S run_benchmark_serving \ --model "$MODEL_NAME" \ - --tokenizer "$MODEL" \ --port "$PORT" \ --backend vllm \ --input-len "$ISL" \ diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh index c7a39e53f..b9fb586df 100644 --- a/benchmarks/gptoss_fp4_mi300x_docker.sh +++ b/benchmarks/gptoss_fp4_mi300x_docker.sh @@ -52,7 +52,6 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S run_benchmark_serving \ --model "$MODEL_NAME" \ - --tokenizer "$MODEL" \ 
--port "$PORT" \ --backend vllm \ --input-len "$ISL" \ diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh index 9eee1a9a3..ba8dd29ad 100644 --- a/benchmarks/gptoss_fp4_mi325x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh @@ -55,7 +55,6 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S run_benchmark_serving \ --model "$MODEL_NAME" \ - --tokenizer "$MODEL" \ --port "$PORT" \ --backend vllm \ --input-len "$ISL" \ diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh index eb26fd467..d04104268 100644 --- a/benchmarks/gptoss_fp4_mi355x_docker.sh +++ b/benchmarks/gptoss_fp4_mi355x_docker.sh @@ -49,7 +49,6 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S run_benchmark_serving \ --model "$MODEL_NAME" \ - --tokenizer "$MODEL" \ --port "$PORT" \ --backend vllm \ --input-len "$ISL" \ From 338d80cefb415ce3c0db0410e3c06eb083ab8cc8 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 19 Jan 2026 23:15:17 -0800 Subject: [PATCH 205/214] remove model_name --- benchmarks/gptoss_fp4_b200_docker.sh | 5 ++--- benchmarks/gptoss_fp4_h100_docker.sh | 5 ++--- benchmarks/gptoss_fp4_h100_slurm.sh | 6 +++--- benchmarks/gptoss_fp4_h200_slurm.sh | 6 +++--- benchmarks/gptoss_fp4_mi300x_docker.sh | 5 ++--- benchmarks/gptoss_fp4_mi325x_slurm.sh | 5 ++--- benchmarks/gptoss_fp4_mi355x_docker.sh | 5 ++--- 7 files changed, 16 insertions(+), 21 deletions(-) diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh index 8949fbc93..322f352c0 100644 --- a/benchmarks/gptoss_fp4_b200_docker.sh +++ b/benchmarks/gptoss_fp4_b200_docker.sh @@ -46,7 +46,6 @@ export PYTHONNOUSERSITE=1 export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -MODEL_NAME=${MODEL##*/} set -x vllm serve $MODEL --host 0.0.0.0 --port $PORT \ @@ -55,7 +54,7 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT \ --tensor-parallel-size $TP \ --max-num-seqs 512 \ --disable-log-requests \ ---served-model-name $MODEL_NAME > $SERVER_LOG 2>&1 & +--served-model-name $MODEL > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -65,7 +64,7 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas run_benchmark_serving \ - --model "$MODEL_NAME" \ + --model "$MODEL" \ --port "$PORT" \ --backend vllm \ --input-len "$ISL" \ diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index dead5fbc7..a4c848119 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -26,7 +26,6 @@ EOF export PYTHONNOUSERSITE=1 export VLLM_MXFP4_USE_MARLIN=1 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -MODEL_NAME=${MODEL##*/} set -x vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ @@ -35,7 +34,7 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --tensor-parallel-size=$TP \ --max-num-seqs=$CONC \ --disable-log-requests \ ---served-model-name $MODEL_NAME > $SERVER_LOG 2>&1 & +--served-model-name $MODEL > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -45,7 +44,7 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas run_benchmark_serving \ - --model "$MODEL_NAME" \ + --model "$MODEL" \ --port "$PORT" \ --backend vllm \ --input-len "$ISL" \ diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh index ac9d29b2e..2e44f95ed 100644 --- a/benchmarks/gptoss_fp4_h100_slurm.sh +++ b/benchmarks/gptoss_fp4_h100_slurm.sh @@ -26,7 +26,7 @@ EOF SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) export TORCH_CUDA_ARCH_LIST="9.0" PORT=${PORT:-8888} -MODEL_NAME=${MODEL##*/} + export VLLM_MXFP4_USE_MARLIN=1 set -x @@ -36,7 +36,7 @@ PYTHONNOUSERSITE=1 vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --tensor-parallel-size=$TP \ --max-num-seqs=$CONC \ --disable-log-requests \ - --served-model-name $MODEL_NAME > $SERVER_LOG 2>&1 & + --served-model-name $MODEL > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -46,7 +46,7 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas run_benchmark_serving \ - --model "$MODEL_NAME" \ + --model "$MODEL" \ --port "$PORT" \ --backend vllm \ --input-len "$ISL" \ diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh index 31689bd4a..abe4c2daf 100644 --- a/benchmarks/gptoss_fp4_h200_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_slurm.sh @@ -39,7 +39,7 @@ EOF SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) export TORCH_CUDA_ARCH_LIST="9.0" PORT=$(( 8888 + $PORT_OFFSET )) -MODEL_NAME=${MODEL##*/} + export VLLM_MXFP4_USE_MARLIN=1 PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT \ @@ -48,7 +48,7 @@ PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT \ --tensor-parallel-size $TP \ --max-num-seqs $CONC \ --disable-log-requests \ - --served-model-name $MODEL_NAME > $SERVER_LOG 2>&1 & + --served-model-name $MODEL > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -56,7 +56,7 @@ SERVER_PID=$! wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" run_benchmark_serving \ - --model "$MODEL_NAME" \ + --model "$MODEL" \ --port "$PORT" \ --backend vllm \ --input-len "$ISL" \ diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh index b9fb586df..1019f2086 100644 --- a/benchmarks/gptoss_fp4_mi300x_docker.sh +++ b/benchmarks/gptoss_fp4_mi300x_docker.sh @@ -30,7 +30,6 @@ export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -MODEL_NAME=${MODEL##*/} set -x vllm serve $MODEL --port $PORT \ @@ -42,7 +41,7 @@ vllm serve $MODEL --port $PORT \ --block-size=64 \ --no-enable-prefix-caching \ --disable-log-requests \ ---served-model-name $MODEL_NAME \ +--served-model-name $MODEL \ --async-scheduling > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -51,7 +50,7 @@ SERVER_PID=$! 
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
 run_benchmark_serving \
-  --model "$MODEL_NAME" \
+  --model "$MODEL" \
   --port "$PORT" \
   --backend vllm \
   --input-len "$ISL" \
diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh
index ba8dd29ad..16f9729ac 100644
--- a/benchmarks/gptoss_fp4_mi325x_slurm.sh
+++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh
@@ -33,7 +33,6 @@ fi
 export VLLM_USE_AITER_UNIFIED_ATTENTION=1
 export VLLM_ROCM_USE_AITER_MHA=0
 export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0
-MODEL_NAME=${MODEL##*/}
 
 set -x
 vllm serve $MODEL --port $PORT \
@@ -45,7 +44,7 @@ vllm serve $MODEL --port $PORT \
 --block-size=64 \
 --no-enable-prefix-caching \
 --disable-log-requests \
---served-model-name $MODEL_NAME \
+--served-model-name $MODEL \
 --async-scheduling > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
@@ -54,7 +53,7 @@ SERVER_PID=$!
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
 run_benchmark_serving \
-  --model "$MODEL_NAME" \
+  --model "$MODEL" \
   --port "$PORT" \
   --backend vllm \
   --input-len "$ISL" \
diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh
index d04104268..6b772be75 100644
--- a/benchmarks/gptoss_fp4_mi355x_docker.sh
+++ b/benchmarks/gptoss_fp4_mi355x_docker.sh
@@ -27,7 +27,6 @@ export VLLM_ROCM_USE_AITER_MHA=0
 export VLLM_ROCM_USE_AITER_FUSED_MOE_A16W4=1
 
 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
-MODEL_NAME=${MODEL##*/}
 
 set -x
 vllm serve $MODEL --port $PORT \
@@ -39,7 +38,7 @@ vllm serve $MODEL --port $PORT \
 --block-size=64 \
 --no-enable-prefix-caching \
 --disable-log-requests \
---served-model-name $MODEL_NAME \
+--served-model-name $MODEL \
 --async-scheduling > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
@@ -48,7 +47,7 @@ SERVER_PID=$!
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
 run_benchmark_serving \
-  --model "$MODEL_NAME" \
+  --model "$MODEL" \
   --port "$PORT" \
   --backend vllm \
   --input-len "$ISL" \

From e28631cdf5e81f1df4f0df2c7cd38db0eac886af Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Tue, 20 Jan 2026 11:20:59 -0800
Subject: [PATCH 206/214] More evals for spec decode

---
 benchmarks/benchmark_lib.sh                  |  1 +
 utils/collect_eval_results.py                | 15 +++++++------
 utils/matrix_logic/generate_sweep_configs.py | 22 ++++++++++++++------
 utils/summarize.py                           |  1 +
 4 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index ba214c61e..8533d54bd 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -398,6 +398,7 @@ append_lm_eval_summary() {
     {
       "framework": "${fw:-unknown}",
       "precision": "${prec:-unknown}",
+      "spec_decoding": "${SPEC_DECODING}",
       "tp": ${TP:-1},
       "conc": ${CONC:-1},
       "ep": ${EP_SIZE:-1},
diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py
index 5ffaa0cc9..8bf3cf66b 100644
--- a/utils/collect_eval_results.py
+++ b/utils/collect_eval_results.py
@@ -8,8 +8,9 @@
 # Import shared utilities from summarize
 sys.path.insert(0, str(Path(__file__).resolve().parent))
 from summarize import (
-    load_json, MODEL, HARDWARE, FRAMEWORK, PRECISION, ISL, OSL,
-    TP, EP, CONC, DP_ATTENTION, TASK, SCORE, EM_STRICT, EM_FLEXIBLE, N_EFF
+    load_json, MODEL, HARDWARE, FRAMEWORK, PRECISION,
+    TP, EP, CONC, DP_ATTENTION, TASK, SCORE, EM_STRICT, EM_FLEXIBLE, N_EFF,
+    SPEC_DECODING
 )
 
 
@@ -160,8 +161,7 @@ def build_row(meta: Dict[str, Any], m: Dict[str, Any]) -> Dict[str, Any]:
         'hw': meta.get('hw', 'unknown').upper(),
         'framework': meta.get('framework', 'unknown').lower(),
         'precision': meta.get('precision', 'unknown').lower(),
-        'isl': int(meta.get('isl', 0)),
-        'osl': int(meta.get('osl', 0)),
+        'spec_decoding': meta.get('spec_decoding', 'unknown'),
         'tp': int(meta.get('tp', 1)),
         'ep': int(meta.get('ep', 1)),
         'conc': int(meta.get('conc', 0)),
@@ -221,7 +221,7 @@ def main():
 
     # Sort for stable output
     rows.sort(key=lambda r: (
-        r['hw'], r['framework'], r['precision'], r['isl'], r['osl'], r['tp'], r['ep'], r['conc']
+        r['hw'], r['framework'], r['precision'], r.get('spec_decoding', ''), r['tp'], r['ep'], r['conc']
     ))
 
     if not rows:
@@ -229,7 +229,7 @@ def main():
     else:
         # Print table using tabulate
         headers = [
-            MODEL, HARDWARE, FRAMEWORK, PRECISION, ISL, OSL, TP, EP, CONC, DP_ATTENTION,
+            MODEL, HARDWARE, FRAMEWORK, PRECISION, SPEC_DECODING, TP, EP, CONC, DP_ATTENTION,
             TASK, SCORE, EM_STRICT, EM_FLEXIBLE, N_EFF
         ]
@@ -239,8 +239,7 @@ def main():
             r['hw'],
             r['framework'].upper(),
             r['precision'].upper(),
-            r['isl'],
-            r['osl'],
+            r['spec_decoding'],
             r['tp'],
             r['ep'],
             r['conc'],
diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py
index ecedda9ef..b6c2cf2f2 100644
--- a/utils/matrix_logic/generate_sweep_configs.py
+++ b/utils/matrix_logic/generate_sweep_configs.py
@@ -1,3 +1,4 @@
+from ast import For
 import json
 import argparse
 import sys
@@ -32,14 +33,19 @@ def seq_len_to_str(isl: int, osl: int) -> str:
     return seq_len_itos.get((isl, osl), f"{isl}_{osl}")
 
 def mark_eval_entries(matrix_values: list[dict]) -> list[dict]:
-    """Mark entries that should run evaluation.
-
-    For each unique (model, runner, framework, precision, isl, osl) combination:
-    - Mark highest TP with highest conc
-    - Mark lowest TP with highest conc
+    """Eval selection policy (single-node only):
+    - Only consider 1k8k (isl=1024, osl=8192).
+    - For each unique (model, runner, framework, precision, isl, osl, spec-decoding):
+      - Mark highest TP with highest conc
+      - Mark lowest TP with highest conc
+
+    Grouping includes spec-decoding so MTP (mtp) and non-MTP (none) are treated
+    independently.
     """
     from collections import defaultdict
 
+    # Only run evals on 1k8k
+    target_isl, target_osl = seq_len_stoi["1k8k"]
     # Group entries by (model, runner, framework, precision, isl, osl)
     # Only include entries that have a top-level TP (i.e., single-node schema).
    # This avoids relying on structural hints like prefill/decode which may be
@@ -50,13 +56,17 @@
         if Fields.TP.value not in entry:
             continue
 
+        if entry.get(Fields.ISL.value) != target_isl or entry.get(Fields.OSL.value) != target_osl:
+            continue
+
         key = (
             entry[Fields.MODEL.value],
             entry[Fields.RUNNER.value],
             entry[Fields.FRAMEWORK.value],
             entry[Fields.PRECISION.value],
             entry[Fields.ISL.value],
-            entry[Fields.OSL.value]
+            entry[Fields.OSL.value],
+            entry[Fields.SPEC_DECODING.value]
         )
         groups[key].append((i, entry))
diff --git a/utils/summarize.py b/utils/summarize.py
index 5e248164f..b4f4ce6a1 100644
--- a/utils/summarize.py
+++ b/utils/summarize.py
@@ -40,6 +40,7 @@
 EM_STRICT = "EM Strict"
 EM_FLEXIBLE = "EM Flexible"
 N_EFF = "N (eff)"
+SPEC_DECODING = "Spec Decode"
 
 
 def load_json(path: Path) -> Optional[Dict[str, Any]]:

From fa49cdc2e24a93eee720e17e29a85a61ec80f919 Mon Sep 17 00:00:00 2001
From: Bryan Shan <58582368+Oseltamivir@users.noreply.github.com>
Date: Sun, 18 Jan 2026 21:17:45 -0800
Subject: [PATCH 207/214] claude pr comments

---
 .github/workflows/claude-pr-review.yml |  4 ++++
 .github/workflows/claude.yml           | 31 +++++++++++++++++++++++---
 2 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/claude-pr-review.yml b/.github/workflows/claude-pr-review.yml
index d52d5aeb4..8886fde16 100644
--- a/.github/workflows/claude-pr-review.yml
+++ b/.github/workflows/claude-pr-review.yml
@@ -8,6 +8,10 @@ on:
   pull_request_review_comment:
     types: [created]
 
+concurrency:
+  group: pr-review-${{ github.event.pull_request.number }}
+  cancel-in-progress: false
+
 jobs:
   review:
     runs-on: ubuntu-latest
diff --git a/.github/workflows/claude.yml b/.github/workflows/claude.yml
index d306e2011..dad25f81a 100644
--- a/.github/workflows/claude.yml
+++ b/.github/workflows/claude.yml
@@ -5,11 +5,17 @@ on:
     types: [created]
   issues:
     types: [opened, assigned]
+  pull_request_review_comment:
+    types: [created]
+
+concurrency:
+  group: claude-code-${{ github.event.issue.number }}
+  cancel-in-progress: false
 
 jobs:
   claude:
     if: |
-      (github.event_name == 'issue_comment' && contains(github.event.comment.body, '@claude')) ||
+      ((github.event_name == 'issue_comment' || github.event_name == 'pull_request_review_comment') && contains(github.event.comment.body, '@claude')) ||
      (github.event_name == 'issues' && (contains(github.event.issue.body, '@claude') || contains(github.event.issue.title, '@claude')))
     runs-on: ubuntu-latest
     permissions:
@@ -43,9 +49,20 @@ jobs:
          trigger_phrase: "@claude"
          track_progress: true
          allowed_bots: ''
+
+          mcp_config: |
+            {
+              "mcpServers": {
+                "fetch": {
+                  "command": "npx",
+                  "args": ["-y", "@anthropic-ai/mcp-server-fetch@latest"]
+                }
+              }
+            }
+
          claude_args: |
            --model ${{
              contains(github.event.comment.body || github.event.issue.body || '', '@claude sonnet') && 'claude-sonnet-4-5-20250929' ||
              contains(github.event.comment.body || github.event.issue.body || '', '@claude haiku') && 'claude-haiku-4-5-20251001' ||
              'claude-opus-4-5-20251101' }}
-            --allowedTools "Write,Edit,Read,Glob,Grep,mcp__github__*,mcp__github_inline_comment__create_inline_comment,Bash(*,timeout=28800000)"
+            --allowedTools "Write,Edit,Read,Glob,Grep,mcp__github__*,mcp__github_inline_comment__create_inline_comment,mcp__fetch__*,Bash(*,timeout=28800000)"
          prompt: |
            REPO: ${{ github.repository }}
            PR/ISSUE NUMBER: ${{ github.event.pull_request.number || github.event.issue.number }}

            You can analyze the json with:
            ```bash
-            python3 <<'EOF'\nimport json...
+            python3 <<'EOF'\nimport json \nwith open('agg_bmk.json') as f: data = json.load(f) \n# Your analysis code here \nEOF
            ```

            To trigger e2e tests, use the `mcp__github__run_workflow` tool to directly dispatch the e2e-tests.yml workflow.
@@ -135,3 +152,11 @@ jobs:
            After triggering, monitor the workflow run using the returned run_id.

            Focus on: code quality, benchmark config changes, and performance impact.
+
+            ## Web Access:
+            You have internet access via MCP servers:
+            - `mcp__fetch__fetch` - Fetch content from any URL
+
+            ### Useful Documentation URLs:
+            - **sglang**: https://docs.sglang.ai/
+            - **vLLM**: https://docs.vllm.ai/en/latest/

From 7e628ff0e58686e73dcba4006b6a785794aa2960 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 19 Jan 2026 11:52:54 -0800
Subject: [PATCH 208/214] chore(deps): bump the github-actions group with 2 updates (#488)

---
 .github/workflows/claude.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/claude.yml b/.github/workflows/claude.yml
index dad25f81a..35c9df757 100644
--- a/.github/workflows/claude.yml
+++ b/.github/workflows/claude.yml
@@ -27,13 +27,13 @@ jobs:
     steps:
       - name: Generate GitHub App token
         id: app-token
-        uses: actions/create-github-app-token@v1
+        uses: actions/create-github-app-token@v2
         with:
           app-id: ${{ secrets.APP_ID }}
           private-key: ${{ secrets.APP_PRIVATE_KEY }}
 
       - name: Checkout repository
-        uses: actions/checkout@v5
+        uses: actions/checkout@v6
         with:
           fetch-depth: 0
           token: ${{ steps.app-token.outputs.token }}

From 518d00417b796745c49ccf7721bc247b077cdabf Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Mon, 19 Jan 2026 16:17:10 -0500
Subject: [PATCH 209/214] fix: update ep metadata in gb200 dynamo sglang configs to match comments (#486)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Update ep values to use the formula: EP = (NODES × 4 GPUs) / num-workers
for both dsr1-fp8-gb200-dynamo-sglang and dsr1-fp4-gb200-dynamo-sglang
configurations.

The metadata isn't used by sglang dynamo scripts (values are hardcoded),
but the frontend uses these values.
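For example, reading the values straight off this config: a decode group with
DECODE_NODES=8 and num-worker: 1 spans 8 × 4 = 32 GPUs, so ep becomes
(8 × 4) / 1 = 32, while a decode group with DECODE_NODES=4 split across
num-worker: 4 gets (4 × 4) / 4 = 4.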
Fixes #485

Co-authored-by: claude[bot] <41898282+claude[bot]@users.noreply.github.com>
Co-authored-by: functionstackx
---
 .github/configs/nvidia-master.yaml | 48 +++++++++++++++----------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 5ffc6f754..06c37888a 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -906,7 +906,7 @@ dsr1-fp8-gb200-dynamo-sglang:
      # tp, ep, and dp-attn do nothing because they are hardcoded in the following file:
      # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh
      tp: 1
-      ep: 1
+      ep: 8
      dp-attn: true
      additional-settings:
        - "PREFILL_NODES=4"
@@ -915,7 +915,7 @@ dsr1-fp8-gb200-dynamo-sglang:
    decode:
      num-worker: 1
      tp: 1
-      ep: 1
+      ep: 32
      dp-attn: true
      additional-settings:
        - "DECODE_NODES=8"
@@ -928,7 +928,7 @@ dsr1-fp8-gb200-dynamo-sglang:
      # tp, ep, and dp-attn do nothing because they are hardcoded in the following file:
      # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh
      tp: 1
-      ep: 1
+      ep: 4
      dp-attn: true
      additional-settings:
        - "PREFILL_NODES=1"
@@ -937,7 +937,7 @@ dsr1-fp8-gb200-dynamo-sglang:
    decode:
      num-worker: 4
      tp: 1
-      ep: 1
+      ep: 4
      dp-attn: true
      additional-settings:
        - "DECODE_NODES=4"
@@ -950,7 +950,7 @@ dsr1-fp8-gb200-dynamo-sglang:
      # tp, ep, and dp-attn do nothing because they are hardcoded in the following file:
      # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh
      tp: 1
-      ep: 1
+      ep: 8
      dp-attn: true
      additional-settings:
        - "PREFILL_NODES=6"
@@ -959,7 +959,7 @@ dsr1-fp8-gb200-dynamo-sglang:
    decode:
      num-worker: 1
      tp: 1
-      ep: 1
+      ep: 48
      dp-attn: true
      additional-settings:
        - "DECODE_NODES=12"
@@ -973,7 +973,7 @@ dsr1-fp8-gb200-dynamo-sglang:
    prefill:
      num-worker: 1
      tp: 1
-      ep: 1
+      ep: 4
      dp-attn: true
      additional-settings:
        - "PREFILL_NODES=1"
@@ -982,7 +982,7 @@ dsr1-fp8-gb200-dynamo-sglang:
    decode:
      num-worker: 1
      tp: 1
-      ep: 1
+      ep: 4
      dp-attn: true
      additional-settings:
        - "DECODE_NODES=1"
@@ -993,7 +993,7 @@ dsr1-fp8-gb200-dynamo-sglang:
    prefill:
      num-worker: 5
      tp: 1
-      ep: 1
+      ep: 8
      dp-attn: true
      additional-settings:
        - "PREFILL_NODES=10"
@@ -1002,7 +1002,7 @@ dsr1-fp8-gb200-dynamo-sglang:
    decode:
      num-worker: 1
      tp: 1
-      ep: 1
+      ep: 32
      dp-attn: true
      additional-settings:
        - "DECODE_NODES=8"
@@ -1029,7 +1029,7 @@ dsr1-fp4-gb200-dynamo-sglang:
    prefill:
      num-worker: 1
      tp: 1
-      ep: 1
+      ep: 4
      dp-attn: true
      additional-settings:
        - "PREFILL_NODES=1"
@@ -1038,7 +1038,7 @@ dsr1-fp4-gb200-dynamo-sglang:
    decode:
      num-worker: 2
      tp: 1
-      ep: 1
+      ep: 4
      dp-attn: true
      additional-settings:
        - "DECODE_NODES=2"
@@ -1049,7 +1049,7 @@ dsr1-fp4-gb200-dynamo-sglang:
    prefill:
      num-worker: 4
      tp: 1
-      ep: 1
+      ep: 4
      dp-attn: true
      additional-settings:
        - "PREFILL_NODES=4"
@@ -1058,7 +1058,7 @@ dsr1-fp4-gb200-dynamo-sglang:
    decode:
      num-worker: 1
      tp: 1
-      ep: 1
+      ep: 48
      dp-attn: true
      additional-settings:
        - "DECODE_NODES=12"
@@ -1069,7 +1069,7 @@ dsr1-fp4-gb200-dynamo-sglang:
    prefill:
      num-worker: 4
      tp: 1
-      ep: 1
+      ep: 4
      dp-attn: true
      additional-settings:
        - "PREFILL_NODES=4"
@@ -1078,7 +1078,7 @@ dsr1-fp4-gb200-dynamo-sglang:
    decode:
      num-worker: 1
      tp: 1
-      ep: 1
+      ep: 32
      dp-attn: true
      additional-settings:
        - "DECODE_NODES=8"
@@ -1090,7 +1090,7 @@ dsr1-fp4-gb200-dynamo-sglang:
    prefill:
      num-worker: 1
      tp: 1
-      ep: 1
+      ep: 4
      dp-attn: false
      additional-settings:
        - "PREFILL_NODES=1"
@@ -1099,7 +1099,7 @@ dsr1-fp4-gb200-dynamo-sglang:
    decode:
      num-worker: 4
      tp: 1
-      ep: 1
+      ep: 4
      dp-attn: true
      additional-settings:
        - "DECODE_NODES=4"
@@ -1108,7 +1108,7 @@ dsr1-fp4-gb200-dynamo-sglang:
    prefill:
      num-worker: 6
      tp: 1
-      ep: 1
+      ep: 4
      dp-attn: false
      additional-settings:
        - "PREFILL_NODES=6"
@@ -1117,7 +1117,7 @@ dsr1-fp4-gb200-dynamo-sglang:
    decode:
      num-worker: 1
      tp: 1
-      ep: 1
+      ep: 48
      dp-attn: true
      additional-settings:
        - "DECODE_NODES=12"
@@ -1126,7 +1126,7 @@ dsr1-fp4-gb200-dynamo-sglang:
    prefill:
      num-worker: 10
      tp: 1
-      ep: 1
+      ep: 4
      dp-attn: true
      additional-settings:
        - "PREFILL_NODES=10"
@@ -1135,7 +1135,7 @@ dsr1-fp4-gb200-dynamo-sglang:
    decode:
      num-worker: 1
      tp: 1
-      ep: 1
+      ep: 32
      dp-attn: true
      additional-settings:
        - "DECODE_NODES=8"
@@ -1144,7 +1144,7 @@ dsr1-fp4-gb200-dynamo-sglang:
    prefill:
      num-worker: 10
      tp: 1
-      ep: 1
+      ep: 4
      dp-attn: true
      additional-settings:
        - "PREFILL_NODES=10"
@@ -1153,7 +1153,7 @@ dsr1-fp4-gb200-dynamo-sglang:
    decode:
      num-worker: 1
      tp: 1
-      ep: 1
+      ep: 32
      dp-attn: true
      additional-settings:
        - "DECODE_NODES=8"

From 388020f78ba3ceec4714a53e07ece6418d894c06 Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Mon, 19 Jan 2026 16:35:46 -0500
Subject: [PATCH 210/214] Experimental folder (increasing researcher/developer velocity) (#489)

---
 experimental/.gitignore          |  1 +
 experimental/README.md           |  5 +++++
 experimental/multiturn/README.md | 14 ++++++++++++++
 3 files changed, 20 insertions(+)
 create mode 100644 experimental/.gitignore
 create mode 100644 experimental/README.md
 create mode 100644 experimental/multiturn/README.md

diff --git a/experimental/.gitignore b/experimental/.gitignore
new file mode 100644
index 000000000..735d7060f
--- /dev/null
+++ b/experimental/.gitignore
@@ -0,0 +1 @@
+rocm-libraries/
\ No newline at end of file
diff --git a/experimental/README.md b/experimental/README.md
new file mode 100644
index 000000000..f39dfc4af
--- /dev/null
+++ b/experimental/README.md
@@ -0,0 +1,5 @@
+# Experimental
+
+This folder contains experimental WIP code that is mostly Claude Code generated.
+
+**Warning:** Code in this directory is very basic and likely contains errors or incomplete implementations. It is not intended for production use or as part of the official InferenceMAX results.
diff --git a/experimental/multiturn/README.md b/experimental/multiturn/README.md
new file mode 100644
index 000000000..358b53991
--- /dev/null
+++ b/experimental/multiturn/README.md
@@ -0,0 +1,14 @@
+## Experimental WIP: Multi turn with/without CPU KVCache Offloading
+
+lit review
+- https://lmsys.org/blog/2025-09-10-sglang-hicache/
+- sglang refers to GPU HBM as (L1) and CPU DRAM as (L2)
+- https://lmsys.org/images/blog/hicache/mooncake_benchmark.png
+- single turn long context Q&A https://arxiv.org/abs/2311.04939 (seems more like a shared-prefix style similar to cascade attention (precursor to sglang radix attention)) https://flashinfer.ai/2024/02/02/cascade-inference.html
+- Production Alibaba Multi turn dataset https://arxiv.org/abs/2506.02634 (seems to not provide the actual prompts and outputs though, more just prompt lengths and output lengths, etc.)
+- sglang synthetic multi turn benchmark script here https://github.com/sgl-project/sglang/tree/main/benchmark/hicache
+- interestingly sglang blog simulates PD disagg via just setting OSL as 1
+```bash
+python3 benchmark/hicache/bench_multiturn.py --model-path $MODEL_PATH --disable-random-sample \
+--output-length 1 --request-length 2048 \ # simulate P-D disaggregation
+```
\ No newline at end of file

From ef15b99f8fee104f9784a64cbdfcd2b9eb283a7e Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Wed, 21 Jan 2026 09:30:20 -0800
Subject: [PATCH 211/214] summary table

---
 utils/collect_eval_results.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py
index 8bf3cf66b..8b471034c 100644
--- a/utils/collect_eval_results.py
+++ b/utils/collect_eval_results.py
@@ -165,7 +165,7 @@ def build_row(meta: Dict[str, Any], m: Dict[str, Any]) -> Dict[str, Any]:
         'tp': int(meta.get('tp', 1)),
         'ep': int(meta.get('ep', 1)),
         'conc': int(meta.get('conc', 0)),
-        'dp_attention': str(meta.get('dp_attention', False)).lower(),
+        'dp_attention': str(meta.get('dp_attention', "none")).lower(),
         'task': m.get('task', 'unknown'),
         'em_strict': m.get('strict'),
         'em_strict_se': m.get('strict_se'),

From 62079d67994179524748b476d9f81ec1dbb2dc61 Mon Sep 17 00:00:00 2001
From: Bryan Shan <58582368+Oseltamivir@users.noreply.github.com>
Date: Wed, 21 Jan 2026 10:42:03 -0800
Subject: [PATCH 212/214] Remove git installation and repository cloning

Removed git installation check and cloning of bench_serving repository.
---
 benchmarks/benchmark_lib.sh | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 42f57f762..f48e4927c 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -183,20 +183,6 @@ run_benchmark_serving() {
         esac
     done
 
-    # Check if git is installed, install if missing
-    if ! command -v git &> /dev/null; then
-        echo "git not found, installing..."
-        if command -v apt-get &> /dev/null; then
-            sudo apt-get update && sudo apt-get install -y git
-        else
-            echo "Error: Could not install git. Package manager not found."
-            return 1
-        fi
-    fi
-
-    local BENCH_SERVING_DIR
-    BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX)
-    git clone https://github.com/kimbochen/bench_serving.git "$BENCH_SERVING_DIR"
     # Validate all required parameters
     if [[ -z "$model" ]]; then
         echo "Error: --model is required"

From 5409158d0a50e520581289cf3ab6971a91a60897 Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Wed, 21 Jan 2026 11:33:31 -0800
Subject: [PATCH 213/214] evals final

---
 benchmarks/dsr1_fp4_b200_trt_slurm.sh  | 4 ++--
 benchmarks/dsr1_fp8_b200_trt_slurm.sh  | 4 ++--
 benchmarks/dsr1_fp8_h200_trt_slurm.sh  | 4 ++--
 benchmarks/gptoss_fp4_b200_docker.sh   | 3 +--
 benchmarks/gptoss_fp4_h100_docker.sh   | 5 +----
 benchmarks/gptoss_fp4_h100_slurm.sh    | 3 +--
 benchmarks/gptoss_fp4_h200_slurm.sh    | 3 +--
 benchmarks/gptoss_fp4_mi300x_docker.sh | 1 -
 benchmarks/gptoss_fp4_mi325x_slurm.sh  | 1 -
 benchmarks/gptoss_fp4_mi355x_docker.sh | 1 -
 10 files changed, 10 insertions(+), 19 deletions(-)

diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh
index 65edb89d1..7886da1c9 100644
--- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh
+++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh
@@ -88,8 +88,8 @@ fi
 set -x
 MAX_NUM_TOKENS=$(( ($CONC+$ISL+64+63)/64*64 ))
-MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 4096 ? MAX_MODEL_LEN : 4096 ))
-MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 4096 ? MAX_NUM_TOKENS : 4096 ))
+MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 ))
+MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? MAX_NUM_TOKENS : 8192 ))
 
 # Launch TRT-LLM server
 mpirun -n 1 --oversubscribe --allow-run-as-root \
diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh
index 6fb75eeee..42a8cfd3e 100644
--- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh
+++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh
@@ -58,8 +58,8 @@ fi
 set -x
 MAX_NUM_TOKENS=$(( ($CONC+$ISL+64+63)/64*64 ))
-MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 4096 ? MAX_MODEL_LEN : 4096 ))
-MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 4096 ? MAX_NUM_TOKENS : 4096 ))
+MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 ))
+MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? MAX_NUM_TOKENS : 8192 ))
 
 # Launch TRT-LLM server
 mpirun -n 1 --oversubscribe --allow-run-as-root \
diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh
index bc8fffea7..b72df9577 100644
--- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh
+++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh
@@ -58,8 +58,8 @@ fi
 set -x
 MAX_NUM_TOKENS=$(( (CONC + ISL + 64 + 63) / 64 * 64 ))
-MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 4096 ? MAX_MODEL_LEN : 4096 ))
-MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 4096 ? MAX_NUM_TOKENS : 4096 ))
+MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 ))
+MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? MAX_NUM_TOKENS : 8192 ))
 
 # Launch TRT-LLM server
 PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \
diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh
index 2702cd477..841b9df7a 100644
--- a/benchmarks/gptoss_fp4_b200_docker.sh
+++ b/benchmarks/gptoss_fp4_b200_docker.sh
@@ -52,8 +52,7 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT \
 --gpu-memory-utilization 0.9 \
 --tensor-parallel-size $TP \
 --max-num-seqs 512 \
---disable-log-requests \
---served-model-name $MODEL > $SERVER_LOG 2>&1 &
+--disable-log-requests > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh
index a4c848119..aa02bf286 100644
--- a/benchmarks/gptoss_fp4_h100_docker.sh
+++ b/benchmarks/gptoss_fp4_h100_docker.sh
@@ -13,8 +13,6 @@ check_env_vars \
   RANDOM_RANGE_RATIO \
   RESULT_FILENAME
 
-
-# Create a basic vLLM config
 cat > config.yaml << EOF
 async-scheduling: true
 no-enable-prefix-caching: true
@@ -33,8 +31,7 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \
 --gpu-memory-utilization=0.9 \
 --tensor-parallel-size=$TP \
 --max-num-seqs=$CONC \
---disable-log-requests \
---served-model-name $MODEL > $SERVER_LOG 2>&1 &
+--disable-log-requests > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh
index 2e44f95ed..c89104790 100644
--- a/benchmarks/gptoss_fp4_h100_slurm.sh
+++ b/benchmarks/gptoss_fp4_h100_slurm.sh
@@ -35,8 +35,7 @@ PYTHONNOUSERSITE=1 vllm serve $MODEL --host=0.0.0.0 --port=$PORT \
     --gpu-memory-utilization=0.9 \
     --tensor-parallel-size=$TP \
     --max-num-seqs=$CONC \
-    --disable-log-requests \
-    --served-model-name $MODEL > $SERVER_LOG 2>&1 &
+    --disable-log-requests > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh
index abe4c2daf..4504b9417 100644
--- a/benchmarks/gptoss_fp4_h200_slurm.sh
+++ b/benchmarks/gptoss_fp4_h200_slurm.sh
@@ -47,8 +47,7 @@ PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT \
     --gpu-memory-utilization 0.9 \
     --tensor-parallel-size $TP \
     --max-num-seqs $CONC \
-    --disable-log-requests \
-    --served-model-name $MODEL > $SERVER_LOG 2>&1 &
+    --disable-log-requests > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh
index 1019f2086..1dfd0c343 100644
--- a/benchmarks/gptoss_fp4_mi300x_docker.sh
+++ b/benchmarks/gptoss_fp4_mi300x_docker.sh
@@ -41,7 +41,6 @@ vllm serve $MODEL --port $PORT \
 --block-size=64 \
 --no-enable-prefix-caching \
 --disable-log-requests \
---served-model-name $MODEL \
 --async-scheduling > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh
index 16f9729ac..255bb3df5 100644
--- a/benchmarks/gptoss_fp4_mi325x_slurm.sh
+++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh
@@ -44,7 +44,6 @@ vllm serve $MODEL --port $PORT \
 --block-size=64 \
 --no-enable-prefix-caching \
 --disable-log-requests \
---served-model-name $MODEL \
 --async-scheduling > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh
index 0be8558a9..651f1da67 100644
--- a/benchmarks/gptoss_fp4_mi355x_docker.sh
+++ b/benchmarks/gptoss_fp4_mi355x_docker.sh
@@ -37,7 +37,6 @@ vllm serve $MODEL --port $PORT \
 --block-size=64 \
 --no-enable-prefix-caching \
 --disable-log-requests \
---served-model-name $MODEL \
 --async-scheduling > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!

From 9ae0f9066be1ed68730aae8d177e850537e6199c Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Wed, 21 Jan 2026 14:09:51 -0800
Subject: [PATCH 214/214] more retries, lower conc, for stability

---
 benchmarks/benchmark_lib.sh                | 2 +-
 benchmarks/dsr1_fp4_b200_docker.sh         | 2 +-
 benchmarks/dsr1_fp4_b200_slurm.sh          | 2 +-
 benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh  | 2 +-
 benchmarks/dsr1_fp4_b200_trt_slurm.sh      | 2 +-
 benchmarks/dsr1_fp4_mi355x_atom_slurm.sh   | 2 +-
 benchmarks/dsr1_fp4_mi355x_docker.sh       | 2 +-
 benchmarks/dsr1_fp4_mi355x_slurm.sh        | 2 +-
 benchmarks/dsr1_fp8_b200_docker.sh         | 2 +-
 benchmarks/dsr1_fp8_b200_slurm.sh          | 2 +-
 benchmarks/dsr1_fp8_b200_trt_slurm.sh      | 2 +-
 benchmarks/dsr1_fp8_h200_slurm.sh          | 2 +-
 benchmarks/dsr1_fp8_h200_trt_slurm.sh      | 2 +-
 benchmarks/dsr1_fp8_mi300x_docker.sh       | 2 +-
 benchmarks/dsr1_fp8_mi300x_slurm.sh        | 2 +-
 benchmarks/dsr1_fp8_mi325x_docker.sh       | 2 +-
 benchmarks/dsr1_fp8_mi325x_slurm.sh        | 2 +-
 benchmarks/dsr1_fp8_mi355x_atom_slurm.sh   | 2 +-
 benchmarks/dsr1_fp8_mi355x_docker.sh       | 2 +-
 benchmarks/dsr1_fp8_mi355x_slurm.sh        | 2 +-
 benchmarks/gptoss_fp4_b200_docker.sh       | 2 +-
 benchmarks/gptoss_fp4_b200_slurm.sh        | 2 +-
 benchmarks/gptoss_fp4_h100_docker.sh       | 2 +-
 benchmarks/gptoss_fp4_h100_slurm.sh        | 2 +-
 benchmarks/gptoss_fp4_h200_slurm.sh        | 2 +-
 benchmarks/gptoss_fp4_h200_trt_slurm.sh    | 2 +-
 benchmarks/gptoss_fp4_mi300x_docker.sh     | 2 +-
 benchmarks/gptoss_fp4_mi300x_slurm.sh      | 2 +-
 benchmarks/gptoss_fp4_mi325x_docker.sh     | 2 +-
 benchmarks/gptoss_fp4_mi325x_slurm.sh      | 2 +-
 benchmarks/gptoss_fp4_mi355x_atom_slurm.sh | 2 +-
 benchmarks/gptoss_fp4_mi355x_docker.sh     | 2 +-
 benchmarks/gptoss_fp4_mi355x_slurm.sh      | 2 +-
 33 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index f48e4927c..cafa5347f 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -412,7 +412,7 @@ run_lm_eval() {
     --tasks "utils/evals/${task}.yaml" \
     --num_fewshot "${num_fewshot}" \
     --output_path "${results_dir}" --log_samples \
-    --model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=2,num_concurrent=${concurrent_requests},tokenized_requests=False,max_length=${gen_max_tokens}" \
+    --model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=5,num_concurrent=${concurrent_requests},tokenized_requests=False,max_length=${gen_max_tokens}" \
     --gen_kwargs "max_tokens=${gen_max_tokens},temperature=${temperature},top_p=${top_p}"
   local eval_exit=$?
   set +x
diff --git a/benchmarks/dsr1_fp4_b200_docker.sh b/benchmarks/dsr1_fp4_b200_docker.sh
index a5d919f9f..30e564dd9 100644
--- a/benchmarks/dsr1_fp4_b200_docker.sh
+++ b/benchmarks/dsr1_fp4_b200_docker.sh
@@ -61,7 +61,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp4_b200_slurm.sh b/benchmarks/dsr1_fp4_b200_slurm.sh
index 875ad19f5..0da2913d2 100644
--- a/benchmarks/dsr1_fp4_b200_slurm.sh
+++ b/benchmarks/dsr1_fp4_b200_slurm.sh
@@ -58,7 +58,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh
index 104a33ca2..dce21701c 100644
--- a/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh
+++ b/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh
@@ -105,7 +105,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh
index 7886da1c9..459cff1b3 100644
--- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh
+++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh
@@ -121,7 +121,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp4_mi355x_atom_slurm.sh b/benchmarks/dsr1_fp4_mi355x_atom_slurm.sh
index 786c879be..a63039af3 100644
--- a/benchmarks/dsr1_fp4_mi355x_atom_slurm.sh
+++ b/benchmarks/dsr1_fp4_mi355x_atom_slurm.sh
@@ -66,7 +66,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp4_mi355x_docker.sh b/benchmarks/dsr1_fp4_mi355x_docker.sh
index d0807a1c2..ba19b64e3 100644
--- a/benchmarks/dsr1_fp4_mi355x_docker.sh
+++ b/benchmarks/dsr1_fp4_mi355x_docker.sh
@@ -57,7 +57,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/dsr1_fp4_mi355x_slurm.sh b/benchmarks/dsr1_fp4_mi355x_slurm.sh
index c381f199d..63856676e 100644
--- a/benchmarks/dsr1_fp4_mi355x_slurm.sh
+++ b/benchmarks/dsr1_fp4_mi355x_slurm.sh
@@ -58,7 +58,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp8_b200_docker.sh b/benchmarks/dsr1_fp8_b200_docker.sh
index 73497dd3c..dd19b94a0 100644
--- a/benchmarks/dsr1_fp8_b200_docker.sh
+++ b/benchmarks/dsr1_fp8_b200_docker.sh
@@ -93,7 +93,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/dsr1_fp8_b200_slurm.sh b/benchmarks/dsr1_fp8_b200_slurm.sh
index 76549bbd5..da1a7f4cd 100644
--- a/benchmarks/dsr1_fp8_b200_slurm.sh
+++ b/benchmarks/dsr1_fp8_b200_slurm.sh
@@ -90,7 +90,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh
index 42a8cfd3e..1602d802b 100644
--- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh
+++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh
@@ -91,7 +91,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp8_h200_slurm.sh b/benchmarks/dsr1_fp8_h200_slurm.sh
index 657504290..117008a63 100644
--- a/benchmarks/dsr1_fp8_h200_slurm.sh
+++ b/benchmarks/dsr1_fp8_h200_slurm.sh
@@ -62,7 +62,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh
index b72df9577..98a6de420 100644
--- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh
+++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh
@@ -91,7 +91,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/dsr1_fp8_mi300x_docker.sh b/benchmarks/dsr1_fp8_mi300x_docker.sh
index ca2bbdd56..c7de3eec5 100644
--- a/benchmarks/dsr1_fp8_mi300x_docker.sh
+++ b/benchmarks/dsr1_fp8_mi300x_docker.sh
@@ -60,7 +60,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/dsr1_fp8_mi300x_slurm.sh b/benchmarks/dsr1_fp8_mi300x_slurm.sh
index 3c3c00029..f4e029fe5 100644
--- a/benchmarks/dsr1_fp8_mi300x_slurm.sh
+++ b/benchmarks/dsr1_fp8_mi300x_slurm.sh
@@ -65,7 +65,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/dsr1_fp8_mi325x_docker.sh b/benchmarks/dsr1_fp8_mi325x_docker.sh
index be756fa6c..c990ef2a1 100644
--- a/benchmarks/dsr1_fp8_mi325x_docker.sh
+++ b/benchmarks/dsr1_fp8_mi325x_docker.sh
@@ -51,7 +51,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/dsr1_fp8_mi325x_slurm.sh b/benchmarks/dsr1_fp8_mi325x_slurm.sh
index fd2fc3886..82f0833ff 100644
--- a/benchmarks/dsr1_fp8_mi325x_slurm.sh
+++ b/benchmarks/dsr1_fp8_mi325x_slurm.sh
@@ -54,7 +54,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/dsr1_fp8_mi355x_atom_slurm.sh b/benchmarks/dsr1_fp8_mi355x_atom_slurm.sh
index 786c879be..a63039af3 100644
--- a/benchmarks/dsr1_fp8_mi355x_atom_slurm.sh
+++ b/benchmarks/dsr1_fp8_mi355x_atom_slurm.sh
@@ -66,7 +66,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp8_mi355x_docker.sh b/benchmarks/dsr1_fp8_mi355x_docker.sh
index c207802d9..f6527e9b7 100644
--- a/benchmarks/dsr1_fp8_mi355x_docker.sh
+++ b/benchmarks/dsr1_fp8_mi355x_docker.sh
@@ -55,7 +55,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/dsr1_fp8_mi355x_slurm.sh b/benchmarks/dsr1_fp8_mi355x_slurm.sh
index a90fc1067..078a9ec48 100644
--- a/benchmarks/dsr1_fp8_mi355x_slurm.sh
+++ b/benchmarks/dsr1_fp8_mi355x_slurm.sh
@@ -53,7 +53,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh
index 841b9df7a..1a4b55a83 100644
--- a/benchmarks/gptoss_fp4_b200_docker.sh
+++ b/benchmarks/gptoss_fp4_b200_docker.sh
@@ -75,7 +75,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/gptoss_fp4_b200_slurm.sh b/benchmarks/gptoss_fp4_b200_slurm.sh
index a7f507f53..5bcfef9a3 100644
--- a/benchmarks/gptoss_fp4_b200_slurm.sh
+++ b/benchmarks/gptoss_fp4_b200_slurm.sh
@@ -70,7 +70,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh
index aa02bf286..2fd6fc67f 100644
--- a/benchmarks/gptoss_fp4_h100_docker.sh
+++ b/benchmarks/gptoss_fp4_h100_docker.sh
@@ -54,7 +54,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh
index c89104790..1b4da9cce 100644
--- a/benchmarks/gptoss_fp4_h100_slurm.sh
+++ b/benchmarks/gptoss_fp4_h100_slurm.sh
@@ -58,7 +58,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh
index 4504b9417..cfea22b9e 100644
--- a/benchmarks/gptoss_fp4_h200_slurm.sh
+++ b/benchmarks/gptoss_fp4_h200_slurm.sh
@@ -68,7 +68,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/gptoss_fp4_h200_trt_slurm.sh b/benchmarks/gptoss_fp4_h200_trt_slurm.sh
index e06aaa789..875e6ae72 100644
--- a/benchmarks/gptoss_fp4_h200_trt_slurm.sh
+++ b/benchmarks/gptoss_fp4_h200_trt_slurm.sh
@@ -77,7 +77,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh
index 1dfd0c343..467a32a58 100644
--- a/benchmarks/gptoss_fp4_mi300x_docker.sh
+++ b/benchmarks/gptoss_fp4_mi300x_docker.sh
@@ -62,7 +62,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/gptoss_fp4_mi300x_slurm.sh b/benchmarks/gptoss_fp4_mi300x_slurm.sh
index 1f2901113..bc385c264 100644
--- a/benchmarks/gptoss_fp4_mi300x_slurm.sh
+++ b/benchmarks/gptoss_fp4_mi300x_slurm.sh
@@ -69,7 +69,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/gptoss_fp4_mi325x_docker.sh b/benchmarks/gptoss_fp4_mi325x_docker.sh
index 64d2a7291..054f6c377 100644
--- a/benchmarks/gptoss_fp4_mi325x_docker.sh
+++ b/benchmarks/gptoss_fp4_mi325x_docker.sh
@@ -61,7 +61,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh
index 255bb3df5..c0c9597c2 100644
--- a/benchmarks/gptoss_fp4_mi325x_slurm.sh
+++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh
@@ -65,7 +65,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/gptoss_fp4_mi355x_atom_slurm.sh b/benchmarks/gptoss_fp4_mi355x_atom_slurm.sh
index 9cbf4640e..85052b1bc 100644
--- a/benchmarks/gptoss_fp4_mi355x_atom_slurm.sh
+++ b/benchmarks/gptoss_fp4_mi355x_atom_slurm.sh
@@ -67,7 +67,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh
index 651f1da67..7c708ae62 100644
--- a/benchmarks/gptoss_fp4_mi355x_docker.sh
+++ b/benchmarks/gptoss_fp4_mi355x_docker.sh
@@ -58,7 +58,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/gptoss_fp4_mi355x_slurm.sh b/benchmarks/gptoss_fp4_mi355x_slurm.sh
index bd9633b0c..1e5d87dba 100644
--- a/benchmarks/gptoss_fp4_mi355x_slurm.sh
+++ b/benchmarks/gptoss_fp4_mi355x_slurm.sh
@@ -61,7 +61,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x