From cd9cb64c41d3d4c861210b5442cac43b6f07e4d5 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 14:19:08 -0600 Subject: [PATCH 001/214] initial poc --- benchmarks/gptoss_fp4_h100_docker.sh | 29 +++++++++++- benchmarks/gptoss_fp4_h100_slurm.sh | 1 - runners/launch_h100-cr.sh | 68 ++++++++++++++-------------- 3 files changed, 62 insertions(+), 36 deletions(-) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index a8bb57c16..39a5abf63 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -7,6 +7,9 @@ # MAX_MODEL_LEN # TP # CONC +# ISL +# OSL + cat > config.yaml << EOF compilation-config: '{"cudagraph_mode":"PIECEWISE"}' @@ -18,6 +21,7 @@ max-model-len: 10240 EOF export PYTHONNOUSERSITE=1 +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) set -x vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ @@ -25,4 +29,27 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --gpu-memory-utilization=0.9 \ --tensor-parallel-size=$TP \ --max-num-seqs=$CONC \ ---disable-log-requests +--disable-log-requests > $SERVER_LOG 2>&1 & + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" =~ Application\ startup\ complete ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +pip install -q datasets pandas +git clone https://github.com/kimbochen/bench_serving.git +set -x +python3 bench_serving/benchmark_serving.py \ +--model=$MODEL \ +--backend=vllm \ +--base-url=\"http://localhost:$PORT\" \ +--dataset-name=random \ +--random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ +--num-prompts=$(( $CONC * 10 )) --max-concurrency=$CONC \ +--request-rate=inf --ignore-eos \ +--save-result --percentile-metrics='ttft,tpot,itl,e2el' \ +--result-dir=/workspace/ \ +--result-filename=$RESULT_FILENAME.json" \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh index d2819b5b3..e9092703a 100644 --- a/benchmarks/gptoss_fp4_h100_slurm.sh +++ b/benchmarks/gptoss_fp4_h100_slurm.sh @@ -3,7 +3,6 @@ # === Required Env Vars === # HF_TOKEN # HF_HUB_CACHE -# IMAGE # MODEL # ISL # OSL diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh index 47b350128..1eb58c32e 100644 --- a/runners/launch_h100-cr.sh +++ b/runners/launch_h100-cr.sh @@ -4,7 +4,7 @@ HF_HUB_CACHE_MOUNT="/home/ubuntu/hf_hub_cache/" PORT=8888 server_name="bmk-server" -client_name="bmk-client" +# client_name="bmk-client" set -x docker run --rm -d --network=host --name=$server_name \ @@ -17,38 +17,38 @@ docker run --rm -d --network=host --name=$server_name \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_h100_docker.sh" -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ Application\ startup\ complete ]]; then - break - fi -done < <(docker logs -f --tail=0 $server_name 2>&1) - -if ! docker ps --format "{{.Names}}" | grep -q "$server_name"; then - echo "Server container launch failed." 
- exit 1 -fi - -git clone https://github.com/kimbochen/bench_serving.git - -set -x -docker run --rm --network=host --name=$client_name \ --v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ ---entrypoint=/bin/bash \ -$IMAGE \ --lc "pip install -q datasets pandas && \ -python3 bench_serving/benchmark_serving.py \ ---model=$MODEL \ ---backend=vllm \ ---base-url=\"http://localhost:$PORT\" \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) --max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics='ttft,tpot,itl,e2el' \ ---result-dir=/workspace/ \ ---result-filename=$RESULT_FILENAME.json" +# set +x +# while IFS= read -r line; do +# printf '%s\n' "$line" +# if [[ "$line" =~ Application\ startup\ complete ]]; then +# break +# fi +# done < <(docker logs -f --tail=0 $server_name 2>&1) + +# if ! docker ps --format "{{.Names}}" | grep -q "$server_name"; then +# echo "Server container launch failed." +# exit 1 +# fi + +# git clone https://github.com/kimbochen/bench_serving.git + +# set -x +# docker run --rm --network=host --name=$client_name \ +# -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ +# -e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ +# --entrypoint=/bin/bash \ +# $IMAGE \ +# -lc "pip install -q datasets pandas && \ +# python3 bench_serving/benchmark_serving.py \ +# --model=$MODEL \ +# --backend=vllm \ +# --base-url=\"http://localhost:$PORT\" \ +# --dataset-name=random \ +# --random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ +# --num-prompts=$(( $CONC * 10 )) --max-concurrency=$CONC \ +# --request-rate=inf --ignore-eos \ +# --save-result --percentile-metrics='ttft,tpot,itl,e2el' \ +# --result-dir=/workspace/ \ +# --result-filename=$RESULT_FILENAME.json" docker stop $server_name From 00ac64a55cec31c1c6e8761daba271d88a5e6c80 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 14:45:02 -0600 Subject: [PATCH 002/214] remove -d flag when launching docker container --- runners/launch_h100-cr.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh index 1eb58c32e..51def9743 100644 --- a/runners/launch_h100-cr.sh +++ b/runners/launch_h100-cr.sh @@ -7,7 +7,7 @@ server_name="bmk-server" # client_name="bmk-client" set -x -docker run --rm -d --network=host --name=$server_name \ +docker run --rm --network=host --name=$server_name \ --runtime=nvidia --gpus=all --ipc=host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ From e38b38aa96d5a9eb09c1ad09074c15450159ea41 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 14:50:32 -0600 Subject: [PATCH 003/214] syntax error --- benchmarks/gptoss_fp4_h100_docker.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 39a5abf63..3700ea357 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -52,4 +52,4 @@ python3 bench_serving/benchmark_serving.py \ --request-rate=inf --ignore-eos \ --save-result --percentile-metrics='ttft,tpot,itl,e2el' \ --result-dir=/workspace/ \ ---result-filename=$RESULT_FILENAME.json" \ No newline at end of file +--result-filename=$RESULT_FILENAME.json \ No newline at 
end of file From 66eae81b19aacab22a59cf2121f1878c7ff91338 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 14:58:35 -0600 Subject: [PATCH 004/214] compatibility fixes --- benchmarks/gptoss_fp4_h100_docker.sh | 11 ++++------- runners/launch_h100-cr.sh | 2 +- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 3700ea357..aee233793 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -5,6 +5,7 @@ # HF_HUB_CACHE # MODEL # MAX_MODEL_LEN +# RANDOM_RANGE_RATIO # TP # CONC # ISL @@ -21,7 +22,6 @@ max-model-len: 10240 EOF export PYTHONNOUSERSITE=1 -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) set -x vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ @@ -32,12 +32,9 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --disable-log-requests > $SERVER_LOG 2>&1 & set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ Application\ startup\ complete ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +until curl --output /dev/null --silent --head --fail http://localhost:$PORT; do + sleep 5 +done pip install -q datasets pandas git clone https://github.com/kimbochen/bench_serving.git diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh index 51def9743..8553d9b59 100644 --- a/runners/launch_h100-cr.sh +++ b/runners/launch_h100-cr.sh @@ -11,7 +11,7 @@ docker run --rm --network=host --name=$server_name \ --runtime=nvidia --gpus=all --ipc=host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT \ +-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \ -e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --entrypoint=/bin/bash \ $IMAGE \ From fdec241c711313934bc112453739ade661e6c01f Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 14:59:51 -0600 Subject: [PATCH 005/214] add correct endpoint prefix --- benchmarks/gptoss_fp4_h100_docker.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index aee233793..80af1b8e0 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -32,7 +32,7 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --disable-log-requests > $SERVER_LOG 2>&1 & set +x -until curl --output /dev/null --silent --head --fail http://localhost:$PORT; do +until curl --output /dev/null --silent --head --fail http://localhost:$PORT/health; do sleep 5 done From 08de857790de7726a2b1864fdab2de6c1a573768 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 15:06:41 -0600 Subject: [PATCH 006/214] remove reference env var --- benchmarks/gptoss_fp4_h100_docker.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 80af1b8e0..d914e6a06 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -29,7 +29,7 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --gpu-memory-utilization=0.9 \ --tensor-parallel-size=$TP \ --max-num-seqs=$CONC \ ---disable-log-requests > $SERVER_LOG 2>&1 & 
+--disable-log-requests set +x until curl --output /dev/null --silent --head --fail http://localhost:$PORT/health; do From 06231ee3a78ee37eca6a7ed4c58a770e8550041c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 15:13:14 -0600 Subject: [PATCH 007/214] run vllm serve in background --- benchmarks/gptoss_fp4_h100_docker.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index d914e6a06..e4efb03ec 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -29,10 +29,10 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --gpu-memory-utilization=0.9 \ --tensor-parallel-size=$TP \ --max-num-seqs=$CONC \ ---disable-log-requests +--disable-log-requests & set +x -until curl --output /dev/null --silent --head --fail http://localhost:$PORT/health; do +until curl --output /dev/null --silent --fail http://localhost:$PORT/health; do sleep 5 done From 21ed06746baa45be52f65b820110bf7684d2cf01 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 15:18:56 -0600 Subject: [PATCH 008/214] unescape sequences --- benchmarks/gptoss_fp4_h100_docker.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index e4efb03ec..5ba23d1ff 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -42,7 +42,7 @@ set -x python3 bench_serving/benchmark_serving.py \ --model=$MODEL \ --backend=vllm \ ---base-url=\"http://localhost:$PORT\" \ +--base-url=http://localhost:$PORT \ --dataset-name=random \ --random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ --num-prompts=$(( $CONC * 10 )) --max-concurrency=$CONC \ From 65ef1f0b6232c5a513bb6bb1eb37c08d23038e58 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 15:38:27 -0600 Subject: [PATCH 009/214] stop vllm to stdout after it stops --- benchmarks/gptoss_fp4_h100_docker.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 5ba23d1ff..ae5fb2a9f 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -31,11 +31,17 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --max-num-seqs=$CONC \ --disable-log-requests & +SERVER_PID=$! set +x +tail -f /tmp/vllm_server.log & +TAIL_PID=$! + until curl --output /dev/null --silent --fail http://localhost:$PORT/health; do sleep 5 done +kill $TAIL_PID 2>/dev/null + pip install -q datasets pandas git clone https://github.com/kimbochen/bench_serving.git set -x From cb557214884bcc93af777098f4b89ad1c40f3745 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 15:41:45 -0600 Subject: [PATCH 010/214] stop vllm to stdout after it stops pt 2 --- benchmarks/gptoss_fp4_h100_docker.sh | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index ae5fb2a9f..4ef463bf1 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -29,18 +29,16 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --gpu-memory-utilization=0.9 \ --tensor-parallel-size=$TP \ --max-num-seqs=$CONC \ ---disable-log-requests & +--disable-log-requests 2>&1 | tee $(mktemp /tmp/server-XXXXXX.log) & -SERVER_PID=$! +VLLM_PID=$! set +x -tail -f /tmp/vllm_server.log & -TAIL_PID=$! 
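Taken together, patches 006 through 010 converge on a simple readiness idiom: launch the server in the background, then poll its health endpoint until it answers. A minimal standalone sketch of that idiom with an added deadline (the wait_for_health name and the 900-second timeout are illustrative; the scripts themselves poll indefinitely):

wait_for_health() {
    local url=$1
    local deadline=$(( SECONDS + 900 ))
    # --fail makes curl exit non-zero on HTTP errors, so 404/503 keep us polling
    until curl --output /dev/null --silent --fail "$url"; do
        if (( SECONDS > deadline )); then
            echo "server not healthy after 900s" >&2
            return 1
        fi
        sleep 5
    done
}

vllm serve "$MODEL" --host=0.0.0.0 --port="$PORT" &
wait_for_health "http://localhost:$PORT/health" || exit 1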
until curl --output /dev/null --silent --fail http://localhost:$PORT/health; do sleep 5 done -kill $TAIL_PID 2>/dev/null +pkill -P $$ tee 2>/dev/null pip install -q datasets pandas git clone https://github.com/kimbochen/bench_serving.git From 788b7f1510031ecc98493b2c3e16ca14f16bb3bc Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 15:47:37 -0600 Subject: [PATCH 011/214] get rid of docker stop as no longer in detatched --- runners/launch_h100-cr.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh index 8553d9b59..a34b31c88 100644 --- a/runners/launch_h100-cr.sh +++ b/runners/launch_h100-cr.sh @@ -51,4 +51,4 @@ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_h100_docker.sh" # --result-dir=/workspace/ \ # --result-filename=$RESULT_FILENAME.json" -docker stop $server_name +# docker stop $server_name From a87e17496191406f8ed03245e26e077f5ff2661e Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 16:01:09 -0600 Subject: [PATCH 012/214] clone bench serving to tmp dir --- benchmarks/gptoss_fp4_h100_docker.sh | 5 +++-- runners/launch_h100-cr.sh | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 4ef463bf1..5420c220d 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -41,9 +41,10 @@ done pkill -P $$ tee 2>/dev/null pip install -q datasets pandas -git clone https://github.com/kimbochen/bench_serving.git +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR set -x -python3 bench_serving/benchmark_serving.py \ +python3 $BENCH_SERVING_DIR/bench_serving/benchmark_serving.py \ --model=$MODEL \ --backend=vllm \ --base-url=http://localhost:$PORT \ diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh index a34b31c88..18c791614 100644 --- a/runners/launch_h100-cr.sh +++ b/runners/launch_h100-cr.sh @@ -12,7 +12,7 @@ docker run --rm --network=host --name=$server_name \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \ --e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ +-e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_h100_docker.sh" From c1d0a796e0fb75dd4369b0589997f5a9d56853d1 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 16:07:27 -0600 Subject: [PATCH 013/214] clone bench serving to tmp dir pt 2 --- benchmarks/gptoss_fp4_h100_docker.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 5420c220d..2c8bfb3c5 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -44,7 +44,7 @@ pip install -q datasets pandas BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR set -x -python3 $BENCH_SERVING_DIR/bench_serving/benchmark_serving.py \ +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model=$MODEL \ --backend=vllm \ --base-url=http://localhost:$PORT \ From 
4823afa516348ae4f1aac230d8bae751c1e2a91a Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 16:14:36 -0600 Subject: [PATCH 014/214] add explanatory comment --- benchmarks/gptoss_fp4_h100_docker.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 2c8bfb3c5..6229bed85 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -31,13 +31,12 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --max-num-seqs=$CONC \ --disable-log-requests 2>&1 | tee $(mktemp /tmp/server-XXXXXX.log) & +# Show server logs til' it is up, then stop showing VLLM_PID=$! set +x - until curl --output /dev/null --silent --fail http://localhost:$PORT/health; do sleep 5 done - pkill -P $$ tee 2>/dev/null pip install -q datasets pandas From d52299fe46701c857925e507e39e62c134749701 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 12 Nov 2025 16:35:43 -0600 Subject: [PATCH 015/214] cleaning up --- runners/launch_h100-cr.sh | 37 ------------------------------------- 1 file changed, 37 deletions(-) diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh index 18c791614..d1ddc26de 100644 --- a/runners/launch_h100-cr.sh +++ b/runners/launch_h100-cr.sh @@ -4,7 +4,6 @@ HF_HUB_CACHE_MOUNT="/home/ubuntu/hf_hub_cache/" PORT=8888 server_name="bmk-server" -# client_name="bmk-client" set -x docker run --rm --network=host --name=$server_name \ @@ -16,39 +15,3 @@ docker run --rm --network=host --name=$server_name \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_h100_docker.sh" - -# set +x -# while IFS= read -r line; do -# printf '%s\n' "$line" -# if [[ "$line" =~ Application\ startup\ complete ]]; then -# break -# fi -# done < <(docker logs -f --tail=0 $server_name 2>&1) - -# if ! docker ps --format "{{.Names}}" | grep -q "$server_name"; then -# echo "Server container launch failed." 
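The comment added in patch 014 is worth unpacking. In a pipeline of the form cmd 2>&1 | tee $(mktemp ...) &, the whole pipeline is backgrounded, so the server's output is both mirrored to the console and kept on disk; pkill -P $$ tee then kills only tee processes whose parent is this script's shell ($$), silencing the console without signalling the server directly. A stripped-down sketch, where some_server is a hypothetical stand-in for the real serve command:

# hypothetical stand-in for the real server command
some_server 2>&1 | tee "$(mktemp /tmp/server-XXXXXX.log)" &

# ... poll the health endpoint here ...

# tee was forked by this shell, so it is a direct child of $$;
# the server on the other side of the pipe receives no signal
pkill -P $$ tee 2>/dev/null

One caveat: once tee exits, the server's next write to the now reader-less pipe raises SIGPIPE, which is likely why later patches in this series switch to redirecting into a log file and running tail -f on it instead.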
-# exit 1 -# fi - -# git clone https://github.com/kimbochen/bench_serving.git - -# set -x -# docker run --rm --network=host --name=$client_name \ -# -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -# -e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ -# --entrypoint=/bin/bash \ -# $IMAGE \ -# -lc "pip install -q datasets pandas && \ -# python3 bench_serving/benchmark_serving.py \ -# --model=$MODEL \ -# --backend=vllm \ -# --base-url=\"http://localhost:$PORT\" \ -# --dataset-name=random \ -# --random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ -# --num-prompts=$(( $CONC * 10 )) --max-concurrency=$CONC \ -# --request-rate=inf --ignore-eos \ -# --save-result --percentile-metrics='ttft,tpot,itl,e2el' \ -# --result-dir=/workspace/ \ -# --result-filename=$RESULT_FILENAME.json" - -# docker stop $server_name From 85de6e752391380ec328ff119798cfab06695970 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 09:13:56 -0600 Subject: [PATCH 016/214] cleaning up --- benchmarks/gptoss_fp4_mi355x_docker.sh | 38 +++++++++++++++++++++++++- runners/launch_mi355x-amd.sh | 8 +++--- 2 files changed, 41 insertions(+), 5 deletions(-) diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh index 103e77fe3..de5ce9ce7 100644 --- a/benchmarks/gptoss_fp4_mi355x_docker.sh +++ b/benchmarks/gptoss_fp4_mi355x_docker.sh @@ -30,4 +30,40 @@ vllm serve $MODEL --port $PORT \ --block-size=64 \ --no-enable-prefix-caching \ --disable-log-requests \ ---async-scheduling +--async-scheduling | tee $(mktemp /tmp/server-XXXXXX.log) & + +# Show server logs til' it is up, then stop showing +VLLM_PID=$! +set +x +until curl --output /dev/null --silent --fail http://localhost:$PORT/health; do + sleep 5 +done +pkill -P $$ tee 2>/dev/null + +if [[ "$MODEL" == "amd/DeepSeek-R1-0528-MXFP4-Preview" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then + if [[ "$OSL" == "8192" ]]; then + NUM_PROMPTS=$(( CONC * 20 )) + else + NUM_PROMPTS=$(( CONC * 50 )) + fi +else + NUM_PROMPTS=$(( CONC * 10 )) +fi + +git clone https://github.com/kimbochen/bench_serving.git + +set -x +docker run --rm --network=$network_name --name=$client_name \ +-v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ +-e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ +--entrypoint=python3 \ +$IMAGE \ +bench_serving/benchmark_serving.py \ +--model=$MODEL --backend=vllm --base-url="http://localhost:$PORT" \ +--dataset-name=random \ +--random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ +--num-prompts=$NUM_PROMPTS \ +--max-concurrency=$CONC \ +--request-rate=inf --ignore-eos \ +--save-result --percentile-metrics="ttft,tpot,itl,e2el" \ +--result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json diff --git a/runners/launch_mi355x-amd.sh b/runners/launch_mi355x-amd.sh index 87ee8cbd2..b3cecf6e4 100644 --- a/runners/launch_mi355x-amd.sh +++ b/runners/launch_mi355x-amd.sh @@ -17,14 +17,14 @@ HF_HUB_CACHE_MOUNT="/nfsdata/hf_hub_cache-1/" # Temp solution PORT=8888 -network_name="bmk-net" +# network_name="bmk-net" server_name="bmk-server" -client_name="bmk-client" +# client_name="bmk-client" -docker network create $network_name +# docker network create $network_name set -x -docker run --rm -d --ipc=host --shm-size=16g --network=$network_name --name=$server_name \ +docker run --rm -d --ipc=host --shm-size=16g --network=host --name=$server_name \ --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ --cap-add=SYS_PTRACE 
--security-opt seccomp=unconfined \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ From 48f7588da8b40be15bf190e9b4926f08390a17e6 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 10:57:36 -0600 Subject: [PATCH 017/214] adding mi355x refactor --- benchmarks/dsr1_fp8_mi355x_docker.sh | 33 ++++++++++- benchmarks/gptoss_fp4_h100_docker.sh | 1 - benchmarks/gptoss_fp4_mi355x_docker.sh | 24 ++------ runners/launch_mi355x-amd.sh | 78 +++++++++++++------------- 4 files changed, 76 insertions(+), 60 deletions(-) diff --git a/benchmarks/dsr1_fp8_mi355x_docker.sh b/benchmarks/dsr1_fp8_mi355x_docker.sh index f39a8dbbd..baad70fd8 100644 --- a/benchmarks/dsr1_fp8_mi355x_docker.sh +++ b/benchmarks/dsr1_fp8_mi355x_docker.sh @@ -24,5 +24,36 @@ python3 -m sglang.launch_server \ --mem-fraction-static 0.8 --disable-radix-cache \ --num-continuous-decode-steps 4 \ --max-prefill-tokens 196608 \ - --cuda-graph-max-bs 128 + --cuda-graph-max-bs 128 | tee $(mktemp /tmp/server-XXXXXX.log) & + +set +x +until curl --output /dev/null --silent --fail http://localhost:$PORT/health; do + sleep 5 +done +pkill -P $$ tee 2>/dev/null + +if [[ "$MODEL" == "amd/DeepSeek-R1-0528-MXFP4-Preview" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then + if [[ "$OSL" == "8192" ]]; then + NUM_PROMPTS=$(( CONC * 20 )) + else + NUM_PROMPTS=$(( CONC * 50 )) + fi +else + NUM_PROMPTS=$(( CONC * 10 )) +fi + +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +set -x +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ +--model=$MODEL --backend=vllm --base-url="http://localhost:$PORT" \ +--dataset-name=random \ +--random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ +--num-prompts=$NUM_PROMPTS \ +--max-concurrency=$CONC \ +--request-rate=inf --ignore-eos \ +--save-result --percentile-metrics="ttft,tpot,itl,e2el" \ +--result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json + + diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 6229bed85..1b4453be3 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -32,7 +32,6 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --disable-log-requests 2>&1 | tee $(mktemp /tmp/server-XXXXXX.log) & # Show server logs til' it is up, then stop showing -VLLM_PID=$! 
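A note on the growing -e lists in these launchers: with docker run, -e NAME with no value forwards NAME from the launching shell's environment into the container (and sets nothing if NAME is unset there), while -e NAME=value sets it explicitly. That is why the runner scripts only need the benchmark knobs exported in their own environment. A small self-contained illustration with made-up values:

export MODEL=some/model TP=8   # illustrative values, not from the scripts
docker run --rm \
  -e MODEL -e TP \
  -e PORT=8888 \
  ubuntu:24.04 /bin/bash -c 'echo "model=$MODEL tp=$TP port=$PORT"'
# prints: model=some/model tp=8 port=8888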
set +x until curl --output /dev/null --silent --fail http://localhost:$PORT/health; do sleep 5 diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh index de5ce9ce7..533f5e212 100644 --- a/benchmarks/gptoss_fp4_mi355x_docker.sh +++ b/benchmarks/gptoss_fp4_mi355x_docker.sh @@ -8,6 +8,8 @@ # TP # CONC # MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# RESULT_FILENAME cat > config.yaml << EOF compilation-config: '{"compile_sizes":[1,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98,100,102,104,106,108,110,112,114,116,118,120,122,124,126,128,256,512,1024,2048,8192] , "cudagraph_capture_sizes":[1,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98,100,102,104,106,108,110,112,114,116,118,120,122,124,126,128,136,144,152,160,168,176,184,192,200,208,216,224,232,240,248,256,264,272,280,288,296,304,312,320,328,336,344,352,360,368,376,384,392,400,408,416,424,432,440,448,456,464,472,480,488,496,504,512,520,528,536,544,552,560,568,576,584,592,600,608,616,624,632,640,648,656,664,672,680,688,696,704,712,720,728,736,744,752,760,768,776,784,792,800,808,816,824,832,840,848,856,864,872,880,888,896,904,912,920,928,936,944,952,960,968,976,984,992,1000,1008,1016,1024,2048,4096,8192] , "cudagraph_mode": "FULL_AND_PIECEWISE"}' @@ -33,32 +35,16 @@ vllm serve $MODEL --port $PORT \ --async-scheduling | tee $(mktemp /tmp/server-XXXXXX.log) & # Show server logs til' it is up, then stop showing -VLLM_PID=$! set +x until curl --output /dev/null --silent --fail http://localhost:$PORT/health; do sleep 5 done pkill -P $$ tee 2>/dev/null -if [[ "$MODEL" == "amd/DeepSeek-R1-0528-MXFP4-Preview" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then - if [[ "$OSL" == "8192" ]]; then - NUM_PROMPTS=$(( CONC * 20 )) - else - NUM_PROMPTS=$(( CONC * 50 )) - fi -else - NUM_PROMPTS=$(( CONC * 10 )) -fi - -git clone https://github.com/kimbochen/bench_serving.git - +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR set -x -docker run --rm --network=$network_name --name=$client_name \ --v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ ---entrypoint=python3 \ -$IMAGE \ -bench_serving/benchmark_serving.py \ +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model=$MODEL --backend=vllm --base-url="http://localhost:$PORT" \ --dataset-name=random \ --random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ diff --git a/runners/launch_mi355x-amd.sh b/runners/launch_mi355x-amd.sh index b3cecf6e4..009a53108 100644 --- a/runners/launch_mi355x-amd.sh +++ b/runners/launch_mi355x-amd.sh @@ -24,52 +24,52 @@ server_name="bmk-server" # docker network create $network_name set -x -docker run --rm -d --ipc=host --shm-size=16g --network=host --name=$server_name \ +docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ --e ISL -e OSL \ +-e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \ 
--entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi355x_docker.sh" -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ Application\ startup\ complete ]]; then - break - fi -done < <(docker logs -f --tail=0 $server_name 2>&1) +# set +x +# while IFS= read -r line; do +# printf '%s\n' "$line" +# if [[ "$line" =~ Application\ startup\ complete ]]; then +# break +# fi +# done < <(docker logs -f --tail=0 $server_name 2>&1) -if [[ "$MODEL" == "amd/DeepSeek-R1-0528-MXFP4-Preview" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then - if [[ "$OSL" == "8192" ]]; then - NUM_PROMPTS=$(( CONC * 20 )) - else - NUM_PROMPTS=$(( CONC * 50 )) - fi -else - NUM_PROMPTS=$(( CONC * 10 )) -fi +# if [[ "$MODEL" == "amd/DeepSeek-R1-0528-MXFP4-Preview" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then +# if [[ "$OSL" == "8192" ]]; then +# NUM_PROMPTS=$(( CONC * 20 )) +# else +# NUM_PROMPTS=$(( CONC * 50 )) +# fi +# else +# NUM_PROMPTS=$(( CONC * 10 )) +# fi -git clone https://github.com/kimbochen/bench_serving.git +# git clone https://github.com/kimbochen/bench_serving.git -set -x -docker run --rm --network=$network_name --name=$client_name \ --v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ ---entrypoint=python3 \ -$IMAGE \ -bench_serving/benchmark_serving.py \ ---model=$MODEL --backend=vllm --base-url="http://$server_name:$PORT" \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$NUM_PROMPTS \ ---max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics="ttft,tpot,itl,e2el" \ ---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json +# set -x +# docker run --rm --network=$network_name --name=$client_name \ +# -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ +# -e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ +# --entrypoint=python3 \ +# $IMAGE \ +# bench_serving/benchmark_serving.py \ +# --model=$MODEL --backend=vllm --base-url="http://$server_name:$PORT" \ +# --dataset-name=random \ +# --random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ +# --num-prompts=$NUM_PROMPTS \ +# --max-concurrency=$CONC \ +# --request-rate=inf --ignore-eos \ +# --save-result --percentile-metrics="ttft,tpot,itl,e2el" \ +# --result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json if ls gpucore.* 1> /dev/null 2>&1; then echo "gpucore files exist. 
not good" @@ -77,8 +77,8 @@ if ls gpucore.* 1> /dev/null 2>&1; then fi -while [ -n "$(docker ps -aq)" ]; do - docker stop $server_name - docker network rm $network_name - sleep 5 -done +# while [ -n "$(docker ps -aq)" ]; do +# docker stop $server_name +# # docker network rm $network_name +# sleep 5 +# done From faec31e7d9a7cab97d396e9f2533e8e8b8690728 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 11:37:44 -0600 Subject: [PATCH 018/214] adding h200 initial refactor --- benchmarks/gptoss_fp4_h200_trt_slurm.sh | 34 +++++++++++++++++-------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/benchmarks/gptoss_fp4_h200_trt_slurm.sh b/benchmarks/gptoss_fp4_h200_trt_slurm.sh index c148a3cb7..7e411b05a 100644 --- a/benchmarks/gptoss_fp4_h200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_trt_slurm.sh @@ -19,7 +19,7 @@ echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" hf download $MODEL -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +# SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) @@ -45,20 +45,32 @@ stream_interval: 20 EOF #mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens $MAX_MODEL_LEN --num_postprocess_workers 2 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & -mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --max_batch_size $CONC --max_num_tokens 20000 --backend pytorch --extra_llm_api_options gptoss-config.yml --ep_size=$EP_SIZE --trust_remote_code --gpus_per_node 8 --host 0.0.0.0 --port $PORT --tp_size=$TP --pp_size=1 > $SERVER_LOG 2>&1 & - +mpirun -n 1 --oversubscribe --allow-run-as-root \ +trtllm-serve $MODEL \ +--max_batch_size $CONC \ +--max_num_tokens 20000 \ +--backend pytorch \ +--extra_llm_api_options gptoss-config.yml \ +--ep_size=$EP_SIZE \ +--trust_remote_code \ +--gpus_per_node 8 \ +--host 0.0.0.0 \ +--port $PORT \ +--tp_size=$TP \ +--pp_size=1 \ +2>&1 | tee $(mktemp /tmp/server-XXXXXX.log) & +# Show server logs til' it is up, then stop showing set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +pkill -P $$ tee 2>/dev/null set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend openai \ --base-url http://0.0.0.0:$PORT \ --dataset-name random \ From 1ef1b23a56287f129936a1c82f9d594db9410f7a Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 15:46:11 -0600 Subject: [PATCH 019/214] different way to see server logs --- benchmarks/gptoss_fp4_h200_trt_slurm.sh | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/benchmarks/gptoss_fp4_h200_trt_slurm.sh b/benchmarks/gptoss_fp4_h200_trt_slurm.sh index 7e411b05a..ac084ef13 100644 --- a/benchmarks/gptoss_fp4_h200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_trt_slurm.sh @@ -44,6 +44,8 @@ print_iter_log: true stream_interval: 20 EOF +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) + #mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens $MAX_MODEL_LEN 
--num_postprocess_workers 2 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & mpirun -n 1 --oversubscribe --allow-run-as-root \ trtllm-serve $MODEL \ @@ -58,14 +60,18 @@ trtllm-serve $MODEL \ --port $PORT \ --tp_size=$TP \ --pp_size=1 \ -2>&1 | tee $(mktemp /tmp/server-XXXXXX.log) & +> $SERVER_LOG 2>&1 & + +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! # Show server logs til' it is up, then stop showing set +x until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do sleep 5 done -pkill -P $$ tee 2>/dev/null +kill $TAIL_PID set -x BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) From 75523ee0b8f56e252b7f209a01a52743047e2913 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 15:59:58 -0600 Subject: [PATCH 020/214] cleanup --- benchmarks/dsr1_fp8_h200_trt_slurm.sh | 18 ++++++++++-------- benchmarks/gptoss_fp4_h100_docker.sh | 13 ++++++++----- benchmarks/gptoss_fp4_h100_slurm.sh | 18 ++++++++++-------- benchmarks/gptoss_fp4_h200_trt_slurm.sh | 7 +------ benchmarks/gptoss_fp4_mi355x_slurm.sh | 18 ++++++++++-------- 5 files changed, 39 insertions(+), 35 deletions(-) diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index 7b566c0ab..58d7e9724 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -71,17 +71,19 @@ PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \ > $SERVER_LOG 2>&1 & +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID -git clone https://github.com/kimbochen/bench_serving.git set -x -python3 bench_serving/benchmark_serving.py \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend openai \ --base-url http://0.0.0.0:$PORT \ --dataset-name random \ diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 1b4453be3..ae889474c 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -22,6 +22,7 @@ max-model-len: 10240 EOF export PYTHONNOUSERSITE=1 +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) set -x vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ @@ -29,19 +30,21 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --gpu-memory-utilization=0.9 \ --tensor-parallel-size=$TP \ --max-num-seqs=$CONC \ ---disable-log-requests 2>&1 | tee $(mktemp /tmp/server-XXXXXX.log) & +--disable-log-requests > $SERVER_LOG 2>&1 & -# Show server logs til' it is up, then stop showing +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
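All of these serving scripts generate their engine config the same way: a heredoc with an unquoted delimiter, so the shell expands any $VAR references at script run time, before the YAML is written. A minimal sketch of the mechanism (the knob name and value are illustrative):

MAX_NUM_TOKENS=20000
cat > engine-config.yml << EOF
max_num_tokens: $MAX_NUM_TOKENS
EOF
# with a quoted delimiter (<< 'EOF') the literal string $MAX_NUM_TOKENS
# would be written instead, so the unquoted form here is deliberate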
set +x -until curl --output /dev/null --silent --fail http://localhost:$PORT/health; do +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do sleep 5 done -pkill -P $$ tee 2>/dev/null +kill $TAIL_PID pip install -q datasets pandas +set -x BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -set -x python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model=$MODEL \ --backend=vllm \ diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh index e9092703a..d82bebf72 100644 --- a/benchmarks/gptoss_fp4_h100_slurm.sh +++ b/benchmarks/gptoss_fp4_h100_slurm.sh @@ -35,18 +35,20 @@ PYTHONNOUSERSITE=1 vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --max-num-seqs=$CONC \ --disable-log-requests > $SERVER_LOG 2>&1 & +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID pip install -q datasets pandas -git clone https://github.com/kimbochen/bench_serving.git set -x -python3 bench_serving/benchmark_serving.py \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model=$MODEL \ --backend=vllm \ --base-url="http://0.0.0.0:$PORT" \ diff --git a/benchmarks/gptoss_fp4_h200_trt_slurm.sh b/benchmarks/gptoss_fp4_h200_trt_slurm.sh index ac084ef13..0927a0d61 100644 --- a/benchmarks/gptoss_fp4_h200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_trt_slurm.sh @@ -19,7 +19,7 @@ echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" hf download $MODEL -# SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) @@ -44,9 +44,6 @@ print_iter_log: true stream_interval: 20 EOF -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) - -#mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens $MAX_MODEL_LEN --num_postprocess_workers 2 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & mpirun -n 1 --oversubscribe --allow-run-as-root \ trtllm-serve $MODEL \ --max_batch_size $CONC \ @@ -65,8 +62,6 @@ trtllm-serve $MODEL \ # Show logs until server is ready tail -f $SERVER_LOG & TAIL_PID=$! - -# Show server logs til' it is up, then stop showing set +x until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do sleep 5 diff --git a/benchmarks/gptoss_fp4_mi355x_slurm.sh b/benchmarks/gptoss_fp4_mi355x_slurm.sh index 657bc1fdf..1fcba771f 100644 --- a/benchmarks/gptoss_fp4_mi355x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi355x_slurm.sh @@ -38,17 +38,19 @@ vllm serve $MODEL --port $PORT \ --disable-log-requests \ --async-scheduling > $SERVER_LOG 2>&1 & +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
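The mktemp -d pattern standardized here (first introduced in patches 012 and 013) is what makes reruns on a shared workspace idempotent: git clone refuses a non-empty destination, but it will clone into an existing empty directory, and mktemp -d always hands back a fresh empty one. A sketch under those assumptions:

# a unique empty directory per run, so repeated runs never collide
BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX)
git clone https://github.com/kimbochen/bench_serving.git "$BENCH_SERVING_DIR"
python3 "$BENCH_SERVING_DIR/benchmark_serving.py" --help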
set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend vllm \ --base-url "http://0.0.0.0:$PORT" \ --dataset-name random \ From 25366523662ce8a6ae7efeb36f289636f189d125 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 16:39:39 -0600 Subject: [PATCH 021/214] now fail if server fails --- benchmarks/dsr1_fp8_h200_trt_slurm.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index 58d7e9724..a44769bc6 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -69,6 +69,7 @@ PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \ --tp_size=$TP --ep_size=$EP_SIZE \ --extra_llm_api_options=$EXTRA_CONFIG_FILE \ > $SERVER_LOG 2>&1 & +SERVER_PID=$! # Show logs until server is ready @@ -76,6 +77,10 @@ tail -f $SERVER_LOG & TAIL_PID=$! set +x until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + if ! kill -0 $SERVER_PID 2>/dev/null; then + echo "Server died before becoming healthy. Exiting." + exit 1 + fi sleep 5 done kill $TAIL_PID From 2d58f0df37f4ac456e5f0fff95fbc16d3df4a7e5 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 17:10:51 -0600 Subject: [PATCH 022/214] starting on b200 --- benchmarks/gptoss_fp4_b200_docker.sh | 28 +++++++++++++++++++++-- runners/launch_b200-tg.sh | 34 +--------------------------- 2 files changed, 27 insertions(+), 35 deletions(-) diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh index fd6ac15c5..28f3d29cf 100644 --- a/benchmarks/gptoss_fp4_b200_docker.sh +++ b/benchmarks/gptoss_fp4_b200_docker.sh @@ -43,8 +43,32 @@ export TORCH_CUDA_ARCH_LIST="10.0" export VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB='{"2":32,"4":32,"8":8}' export PYTHONNOUSERSITE=1 export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1 +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) set -x vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ ---gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs 512 \ ---disable-log-requests +--gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \ +--disable-log-requests > $SERVER_LOG 2>&1 & + +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
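Patch 021 hardens the wait loop in the dsr1 h200 script with a liveness guard: kill -0 delivers no signal at all, it only reports whether the PID can still be signalled, so the loop aborts as soon as the background server dies instead of polling forever. The same guard would slot into every other wait loop in this series; a sketch of the combined idiom, with some_server standing in for the real serve command:

some_server > "$SERVER_LOG" 2>&1 &
SERVER_PID=$!
until curl --output /dev/null --silent --fail "http://0.0.0.0:$PORT/health"; do
    # kill -0 sends no signal; non-zero status means the process is gone
    if ! kill -0 $SERVER_PID 2>/dev/null; then
        echo "Server died before becoming healthy. Exiting." >&2
        exit 1
    fi
    sleep 5
done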
+set +x +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID + +pip install -q datasets pandas +set -x +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ +--model $MODEL --backend vllm --base-url http://localhost:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) \ +--max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ --result-filename $RESULT_FILENAME.json \ No newline at end of file diff --git a/runners/launch_b200-tg.sh b/runners/launch_b200-tg.sh index 9f313396c..97e975a64 100644 --- a/runners/launch_b200-tg.sh +++ b/runners/launch_b200-tg.sh @@ -5,7 +5,6 @@ FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') PORT=8888 server_name="bmk-server" -client_name="bmk-client" set -x docker run --rm -d --network host --name $server_name \ @@ -14,38 +13,7 @@ docker run --rm -d --network host --name $server_name \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE \ -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ +-e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh" - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ Application\ startup\ complete ]]; then - break - fi -done < <(docker logs -f --tail=0 $server_name 2>&1) - -git clone https://github.com/kimbochen/bench_serving.git - -set -x -docker run --rm --network host --name $client_name \ --v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ ---entrypoint=/bin/bash \ -$(echo "$IMAGE" | sed 's/#/\//') \ --lc "pip install -q datasets pandas && \ -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm --base-url http://localhost:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) \ ---max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json" - -while [ -n "$(docker ps -aq)" ]; do - docker stop $server_name - sleep 5 -done From f5cf4a7167687ac86d9019edcf21cbc01631ab7c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 17:20:53 -0600 Subject: [PATCH 023/214] doign b200 --- benchmarks/dsr1_fp4_b200_docker.sh | 27 ++++++++++++++++++++++- benchmarks/dsr1_fp4_b200_trt_slurm.sh | 18 +++++++++------- benchmarks/dsr1_fp8_b200_docker.sh | 26 +++++++++++++++++++++- benchmarks/dsr1_fp8_b200_trt_slurm.sh | 31 +++++++++++++++------------ benchmarks/gptoss_fp4_b200_docker.sh | 2 ++ 5 files changed, 80 insertions(+), 24 deletions(-) diff --git a/benchmarks/dsr1_fp4_b200_docker.sh b/benchmarks/dsr1_fp4_b200_docker.sh index 3c8232072..6b2112478 100644 --- a/benchmarks/dsr1_fp4_b200_docker.sh +++ b/benchmarks/dsr1_fp4_b200_docker.sh @@ -6,6 +6,8 @@ nvidia-smi # happens 1% of the time. 
ref: https://github.com/flashinfer-ai/flashinfer/pull/1779 sed -i '102,108d' /usr/local/lib/python3.12/dist-packages/flashinfer/jit/cubin_loader.py +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) + # Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls. if [[ $CONC -ge 16 ]]; then SCHEDULER_RECV_INTERVAL=30 @@ -22,5 +24,28 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0. --cuda-graph-max-bs 256 --max-running-requests 256 --mem-fraction-static 0.85 --kv-cache-dtype fp8_e4m3 \ --chunked-prefill-size 16384 \ --ep-size $EP_SIZE --quantization modelopt_fp4 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ ---enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10 +--enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10 > $SERVER_LOG 2>&1 & + +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! +set +x +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID + +pip install -q datasets pandas +set -x +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ +--model $MODEL --backend vllm --base-url http://localhost:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) \ +--max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ --result-filename $RESULT_FILENAME.json diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh index 6f4f814a0..6896880fb 100644 --- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh @@ -101,17 +101,19 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \ > $SERVER_LOG 2>&1 & +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID -git clone https://github.com/kimbochen/bench_serving.git set -x -python3 bench_serving/benchmark_serving.py \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend openai \ --base-url http://0.0.0.0:$PORT \ --dataset-name random \ diff --git a/benchmarks/dsr1_fp8_b200_docker.sh b/benchmarks/dsr1_fp8_b200_docker.sh index 361b6f1f6..babb5c9a6 100644 --- a/benchmarks/dsr1_fp8_b200_docker.sh +++ b/benchmarks/dsr1_fp8_b200_docker.sh @@ -17,6 +17,7 @@ sed -i '102,108d' /usr/local/lib/python3.12/dist-packages/flashinfer/jit/cubin_l export SGL_ENABLE_JIT_DEEPGEMM=false export SGLANG_ENABLE_FLASHINFER_GEMM=true +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) # Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls. if [[ $CONC -ge 16 ]]; then @@ -34,4 +35,27 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0. 
--cuda-graph-max-bs 128 --max-running-requests 128 \ --mem-fraction-static 0.82 --kv-cache-dtype fp8_e4m3 --chunked-prefill-size 32768 --max-prefill-tokens 32768 \ --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL --disable-radix-cache \ ---attention-backend trtllm_mla --stream-interval 30 --ep-size $EP_SIZE --moe-runner-backend flashinfer_trtllm --quantization fp8 +--attention-backend trtllm_mla --stream-interval 30 --ep-size $EP_SIZE --moe-runner-backend flashinfer_trtllm --quantization fp8 > $SERVER_LOG 2>&1 & + +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! +set +x +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID + +pip install -q datasets pandas +set -x +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ +--model $MODEL --backend vllm --base-url http://localhost:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) \ +--max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ --result-filename $RESULT_FILENAME.json \ No newline at end of file diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh index 58d4525f1..81fc4137b 100644 --- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh @@ -69,25 +69,28 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \ --tp_size=$TP --ep_size=$EP_SIZE \ --extra_llm_api_options=$EXTRA_CONFIG_FILE \ > $SERVER_LOG 2>&1 & + +SERVER_PID=$! - +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID -git clone https://github.com/kimbochen/bench_serving.git +pip install -q datasets pandas set -x -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ +--model $MODEL --backend vllm --base-url http://localhost:$PORT \ --dataset-name random \ --random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--num-prompts $(( $CONC * 10 )) \ +--max-concurrency $CONC \ --request-rate inf --ignore-eos \ --save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +--result-dir /workspace/ --result-filename $RESULT_FILENAME.json diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh index 28f3d29cf..f28f525c0 100644 --- a/benchmarks/gptoss_fp4_b200_docker.sh +++ b/benchmarks/gptoss_fp4_b200_docker.sh @@ -50,6 +50,8 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ --gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \ --disable-log-requests > $SERVER_LOG 2>&1 & +SERVER_PID=$! 
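The NUM_PROMPTS values these patches start threading through the launchers encode a sizing rule that is otherwise scattered across the scripts: roughly ten requests per concurrency slot for most models, larger multiples for the DeepSeek-R1 variants, scaled back when OSL is 8192 so long generations keep total wall-clock time in check. A sketch of that rule as one function (the *DeepSeek-R1* glob generalizes the exact model-name checks in the scripts, and the rationale is inferred, not stated in the series):

num_prompts() {
    local conc=$1 model=$2 osl=$3
    case "$model" in
        *DeepSeek-R1*)
            if [[ "$osl" == "8192" ]]; then
                echo $(( conc * 20 ))
            else
                echo $(( conc * 50 ))
            fi
            ;;
        *)
            echo $(( conc * 10 ))
            ;;
    esac
}
NUM_PROMPTS=$(num_prompts "$CONC" "$MODEL" "$OSL")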
+ # Show logs until server is ready tail -f $SERVER_LOG & TAIL_PID=$! From 92af70bc77fbdfa00763ab51d493e559ac3d3e78 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 17:22:18 -0600 Subject: [PATCH 024/214] reverting erroneous change --- benchmarks/dsr1_fp8_b200_trt_slurm.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh index 81fc4137b..741ecdb92 100644 --- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh @@ -69,7 +69,7 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \ --tp_size=$TP --ep_size=$EP_SIZE \ --extra_llm_api_options=$EXTRA_CONFIG_FILE \ > $SERVER_LOG 2>&1 & - + SERVER_PID=$! # Show logs until server is ready @@ -81,16 +81,16 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID -pip install -q datasets pandas set -x BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend vllm --base-url http://localhost:$PORT \ +--model $MODEL --backend openai \ +--base-url http://0.0.0.0:$PORT \ --dataset-name random \ --random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) \ ---max-concurrency $CONC \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ --request-rate inf --ignore-eos \ --save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json From f330d672617cd1176b0eb771b5525617858444d7 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 18:00:48 -0600 Subject: [PATCH 025/214] fixing b200 --- benchmarks/dsr1_fp4_b200_docker.sh | 2 +- benchmarks/dsr1_fp8_b200_docker.sh | 2 +- benchmarks/gptoss_fp4_b200_docker.sh | 2 +- runners/launch_b200-nvd.sh | 44 +++++----------------------- 4 files changed, 11 insertions(+), 39 deletions(-) diff --git a/benchmarks/dsr1_fp4_b200_docker.sh b/benchmarks/dsr1_fp4_b200_docker.sh index 6b2112478..8b9f116c6 100644 --- a/benchmarks/dsr1_fp4_b200_docker.sh +++ b/benchmarks/dsr1_fp4_b200_docker.sh @@ -43,7 +43,7 @@ python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend vllm --base-url http://localhost:$PORT \ --dataset-name random \ --random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) \ +--num-prompts $NUM_PROMPTS \ --max-concurrency $CONC \ --request-rate inf --ignore-eos \ --save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ diff --git a/benchmarks/dsr1_fp8_b200_docker.sh b/benchmarks/dsr1_fp8_b200_docker.sh index babb5c9a6..f1412264c 100644 --- a/benchmarks/dsr1_fp8_b200_docker.sh +++ b/benchmarks/dsr1_fp8_b200_docker.sh @@ -54,7 +54,7 @@ python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend vllm --base-url http://localhost:$PORT \ --dataset-name random \ --random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) \ +--num-prompts $NUM_PROMPTS \ --max-concurrency $CONC \ --request-rate inf --ignore-eos \ --save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh index f28f525c0..530e61373 100644 --- 
a/benchmarks/gptoss_fp4_b200_docker.sh +++ b/benchmarks/gptoss_fp4_b200_docker.sh @@ -69,7 +69,7 @@ python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend vllm --base-url http://localhost:$PORT \ --dataset-name random \ --random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) \ +--num-prompts $NUM_PROMPTS \ --max-concurrency $CONC \ --request-rate inf --ignore-eos \ --save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ diff --git a/runners/launch_b200-nvd.sh b/runners/launch_b200-nvd.sh index 21a10d48f..a2587b477 100644 --- a/runners/launch_b200-nvd.sh +++ b/runners/launch_b200-nvd.sh @@ -25,29 +25,6 @@ set -x # Disabling it can reduce perf but will improve CI stability. i.e. we won't see vLLM/Sglang crashes. # Ref: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-graph-register - -docker run --rm -d --init --network host --name $server_name \ ---runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ --v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE \ --e NCCL_GRAPH_REGISTER=0 \ --e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ ---entrypoint=/bin/bash \ -$(echo "$IMAGE" | sed 's/#/\//') \ -benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh" - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ Application\ startup\ complete ]]; then - break - fi -done < <(docker logs -f --tail=0 $server_name 2>&1) - -git clone https://github.com/kimbochen/bench_serving.git - - if [[ "$MODEL" == "nvidia/DeepSeek-R1-0528-FP4" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then if [[ "$OSL" == "8192" ]]; then NUM_PROMPTS=$(( CONC * 20 )) @@ -58,22 +35,17 @@ else NUM_PROMPTS=$(( CONC * 10 )) fi -set -x -docker run --rm --network host --name $client_name \ +docker run --rm --init --network host --name $server_name \ +--runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ +-v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ +-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE \ +-e NCCL_GRAPH_REGISTER=0 \ +-e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ +-e -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e NUM_PROMPTS \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \ --lc "pip install -q datasets pandas && \ -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm --base-url http://localhost:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $NUM_PROMPTS \ ---max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json" +benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh" # Try graceful first docker stop -t 90 "$server_name" || true From c5fcf816fc85d099bd1bce0d5597f065892d6eed Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 
2025 18:02:36 -0600 Subject: [PATCH 026/214] fixing b200 pt 2 --- runners/launch_b200-nvd.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_b200-nvd.sh b/runners/launch_b200-nvd.sh index a2587b477..47c7c979f 100644 --- a/runners/launch_b200-nvd.sh +++ b/runners/launch_b200-nvd.sh @@ -42,7 +42,7 @@ docker run --rm --init --network host --name $server_name \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE \ -e NCCL_GRAPH_REGISTER=0 \ -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --e -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e NUM_PROMPTS \ +-e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e NUM_PROMPTS \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh" From 3ededf0a1ec9dab24326784b18107d2abb0da88c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 18:16:35 -0600 Subject: [PATCH 027/214] updating mi300 --- benchmarks/dsr1_fp8_mi300x_docker.sh | 27 ++++++++++++++++- benchmarks/dsr1_fp8_mi300x_slurm.sh | 18 ++++++------ benchmarks/gptoss_fp4_b200_docker.sh | 3 +- benchmarks/gptoss_fp4_mi300x_docker.sh | 26 ++++++++++++++++- benchmarks/gptoss_fp4_mi300x_slurm.sh | 19 ++++++------ runners/launch_b200-nvd.sh | 6 ++-- runners/launch_mi300x-amd.sh | 40 ++------------------------ 7 files changed, 78 insertions(+), 61 deletions(-) diff --git a/benchmarks/dsr1_fp8_mi300x_docker.sh b/benchmarks/dsr1_fp8_mi300x_docker.sh index fca44bcf1..82cb4fbee 100644 --- a/benchmarks/dsr1_fp8_mi300x_docker.sh +++ b/benchmarks/dsr1_fp8_mi300x_docker.sh @@ -24,6 +24,8 @@ fi export SGLANG_USE_AITER=1 +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) + set -x python3 -m sglang.launch_server \ --model-path=$MODEL --host=0.0.0.0 --port=$PORT --trust-remote-code \ @@ -33,4 +35,27 @@ python3 -m sglang.launch_server \ --chunked-prefill-size=196608 \ --num-continuous-decode-steps=4 \ --max-prefill-tokens=196608 \ ---disable-radix-cache +--disable-radix-cache > $SERVER_LOG 2>&1 & + + +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! +set +x +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID + +set -x +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ +--model=$MODEL --backend=vllm --base-url=http://$server_name:$PORT \ +--dataset-name=random \ +--random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ +--num-prompts=$(( $CONC * 10 )) \ +--max-concurrency=$CONC \ +--request-rate=inf --ignore-eos \ +--save-result --percentile-metrics="ttft,tpot,itl,e2el" \ +--result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json diff --git a/benchmarks/dsr1_fp8_mi300x_slurm.sh b/benchmarks/dsr1_fp8_mi300x_slurm.sh index 90babeaee..31fe1bf55 100644 --- a/benchmarks/dsr1_fp8_mi300x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi300x_slurm.sh @@ -47,17 +47,19 @@ python3 -m sglang.launch_server \ --disable-radix-cache \ > $SERVER_LOG 2>&1 & +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"The server is fired up and ready to roll"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model=$MODEL --backend=vllm \ --base-url="http://0.0.0.0:$PORT" \ --dataset-name=random \ diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh index 530e61373..ac9aefefe 100644 --- a/benchmarks/gptoss_fp4_b200_docker.sh +++ b/benchmarks/gptoss_fp4_b200_docker.sh @@ -43,6 +43,7 @@ export TORCH_CUDA_ARCH_LIST="10.0" export VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB='{"2":32,"4":32,"8":8}' export PYTHONNOUSERSITE=1 export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1 + SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) set -x @@ -66,7 +67,7 @@ set -x BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend vllm --base-url http://localhost:$PORT \ +--model $MODEL --backend vllm --base-url http://localhost:$PORT \ --dataset-name random \ --random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ --num-prompts $NUM_PROMPTS \ diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh index 66a8642bd..32efdf0fe 100644 --- a/benchmarks/gptoss_fp4_mi300x_docker.sh +++ b/benchmarks/gptoss_fp4_mi300x_docker.sh @@ -24,6 +24,8 @@ export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) + set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ @@ -34,4 +36,26 @@ vllm serve $MODEL --port $PORT \ --block-size=64 \ --no-enable-prefix-caching \ --disable-log-requests \ ---async-scheduling +--async-scheduling > $SERVER_LOG 2>&1 & + +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! +set +x +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID + +set -x +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ +--model=$MODEL --backend=vllm --base-url=http://$server_name:$PORT \ +--dataset-name=random \ +--random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ +--num-prompts=$(( $CONC * 10 )) \ +--max-concurrency=$CONC \ +--request-rate=inf --ignore-eos \ +--save-result --percentile-metrics="ttft,tpot,itl,e2el" \ +--result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json diff --git a/benchmarks/gptoss_fp4_mi300x_slurm.sh b/benchmarks/gptoss_fp4_mi300x_slurm.sh index 0ab5a250f..0e4a0b3b2 100644 --- a/benchmarks/gptoss_fp4_mi300x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi300x_slurm.sh @@ -48,17 +48,18 @@ vllm serve $MODEL --port $PORT \ --async-scheduling \ > $SERVER_LOG 2>&1 & +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend vllm \ --base-url http://0.0.0.0:$PORT \ --dataset-name random \ diff --git a/runners/launch_b200-nvd.sh b/runners/launch_b200-nvd.sh index 47c7c979f..c5216b006 100644 --- a/runners/launch_b200-nvd.sh +++ b/runners/launch_b200-nvd.sh @@ -27,12 +27,12 @@ set -x if [[ "$MODEL" == "nvidia/DeepSeek-R1-0528-FP4" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then if [[ "$OSL" == "8192" ]]; then - NUM_PROMPTS=$(( CONC * 20 )) + export NUM_PROMPTS=$(( CONC * 20 )) else - NUM_PROMPTS=$(( CONC * 50 )) + export NUM_PROMPTS=$(( CONC * 50 )) fi else - NUM_PROMPTS=$(( CONC * 10 )) + export NUM_PROMPTS=$(( CONC * 10 )) fi docker run --rm --init --network host --name $server_name \ diff --git a/runners/launch_mi300x-amd.sh b/runners/launch_mi300x-amd.sh index 51e059d4c..1f77a1ede 100644 --- a/runners/launch_mi300x-amd.sh +++ b/runners/launch_mi300x-amd.sh @@ -5,52 +5,16 @@ sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing' HF_HUB_CACHE_MOUNT="/shareddata/hf_hub_cache_$(hostname)/" PORT=8888 -network_name="bmk-net" server_name="bmk-server" -client_name="bmk-client" - -docker network create $network_name set -x -docker run --rm -d --ipc=host --shm-size=16g --network=$network_name --name=$server_name \ +docker run --rm --ipc=host --shm-size=16g --name=$server_name \ --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ --e ISL -e OSL \ +-e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi300x_docker.sh" - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ Application\ startup\ complete ]]; then - break - fi -done < <(docker logs -f --tail=0 $server_name 2>&1) - -git clone https://github.com/kimbochen/bench_serving.git - -set -x -docker run --rm --network=$network_name --name=$client_name \ --v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ ---entrypoint=python3 \ -$IMAGE \ -bench_serving/benchmark_serving.py \ ---model=$MODEL --backend=vllm --base-url=http://$server_name:$PORT \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) \ ---max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics="ttft,tpot,itl,e2el" \ ---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json - -while [ -n "$(docker ps -aq)" ]; do - docker stop $server_name - docker network rm $network_name - sleep 5 -done From 813381b9616173df45f11f4235b774e586a53d75 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 18:22:42 -0600 
Subject: [PATCH 028/214] updating mi300 pt 2 --- runners/launch_mi300x-amd.sh | 2 +- runners/launch_mi300x-cr.sh | 40 ++---------------------------------- runners/launch_mi300x-oci.sh | 40 ++---------------------------------- 3 files changed, 5 insertions(+), 77 deletions(-) diff --git a/runners/launch_mi300x-amd.sh b/runners/launch_mi300x-amd.sh index 1f77a1ede..780e5a2f0 100644 --- a/runners/launch_mi300x-amd.sh +++ b/runners/launch_mi300x-amd.sh @@ -8,7 +8,7 @@ PORT=8888 server_name="bmk-server" set -x -docker run --rm --ipc=host --shm-size=16g --name=$server_name \ +docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ diff --git a/runners/launch_mi300x-cr.sh b/runners/launch_mi300x-cr.sh index 48be17610..bdcc9e422 100644 --- a/runners/launch_mi300x-cr.sh +++ b/runners/launch_mi300x-cr.sh @@ -5,52 +5,16 @@ sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing' HF_HUB_CACHE_MOUNT="/mnt/vdb/gha_cache/hf_hub_cache/" PORT=8888 -network_name="bmk-net" server_name="bmk-server" -client_name="bmk-client" - -docker network create $network_name set -x -docker run --rm -d --ipc=host --shm-size=16g --network=$network_name --name=$server_name \ +docker run --rm -d --ipc=host --shm-size=16g --network=host --name=$server_name \ --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ --e ISL -e OSL \ +-e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi300x_docker.sh" - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ Application\ startup\ complete ]]; then - break - fi -done < <(docker logs -f --tail=0 $server_name 2>&1) - -git clone https://github.com/kimbochen/bench_serving.git - -set -x -docker run --rm --network=$network_name --name=$client_name \ --v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ ---entrypoint=python3 \ -$IMAGE \ -bench_serving/benchmark_serving.py \ ---model=$MODEL --backend=vllm --base-url=http://$server_name:$PORT \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) \ ---max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics="ttft,tpot,itl,e2el" \ ---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json - -while [ -n "$(docker ps -aq)" ]; do - docker stop $server_name - docker network rm $network_name - sleep 5 -done diff --git a/runners/launch_mi300x-oci.sh b/runners/launch_mi300x-oci.sh index 60cf9c238..2018cbc94 100644 --- a/runners/launch_mi300x-oci.sh +++ b/runners/launch_mi300x-oci.sh @@ -3,52 +3,16 @@ HF_HUB_CACHE_MOUNT="$HOME/hf_hub_cache/" PORT=8888 -network_name="bmk-net" server_name="bmk-server" -client_name="bmk-client" - -docker network create $network_name set -x -docker run --rm -d --ipc=host --shm-size=16g --network=$network_name --name=$server_name \ +docker run --rm -d --ipc=host --shm-size=16g --network=host --name=$server_name \ --privileged 
--cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ --e ISL -e OSL \ +-e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi300x_docker.sh" - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ Application\ startup\ complete ]]; then - break - fi -done < <(docker logs -f --tail=0 $server_name 2>&1) - -git clone https://github.com/kimbochen/bench_serving.git - -set -x -docker run --rm --network=$network_name --name=$client_name \ --v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ ---entrypoint=python3 \ -$IMAGE \ -bench_serving/benchmark_serving.py \ ---model=$MODEL --backend=vllm --base-url=http://$server_name:$PORT \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) \ ---max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics="ttft,tpot,itl,e2el" \ ---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json - -while [ -n "$(docker ps -aq)" ]; do - docker stop $server_name - docker network rm $network_name - sleep 5 -done From e1b387c4f8aa8d92f554cde5a03ca1e6b8282693 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 18:24:03 -0600 Subject: [PATCH 029/214] updating mi300 pt 3 -- remove detached mode --- runners/launch_mi300x-cr.sh | 2 +- runners/launch_mi300x-oci.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/runners/launch_mi300x-cr.sh b/runners/launch_mi300x-cr.sh index bdcc9e422..8fbdaee63 100644 --- a/runners/launch_mi300x-cr.sh +++ b/runners/launch_mi300x-cr.sh @@ -8,7 +8,7 @@ PORT=8888 server_name="bmk-server" set -x -docker run --rm -d --ipc=host --shm-size=16g --network=host --name=$server_name \ +docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ diff --git a/runners/launch_mi300x-oci.sh b/runners/launch_mi300x-oci.sh index 2018cbc94..33614a03c 100644 --- a/runners/launch_mi300x-oci.sh +++ b/runners/launch_mi300x-oci.sh @@ -6,7 +6,7 @@ PORT=8888 server_name="bmk-server" set -x -docker run --rm -d --ipc=host --shm-size=16g --network=host --name=$server_name \ +docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ From c0a5c62b51a58eeea87c8ba59b23e69c7bb31611 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 18:27:48 -0600 Subject: [PATCH 030/214] cleaning up mi355x --- benchmarks/gptoss_fp4_mi355x_docker.sh | 14 +++++--- runners/launch_mi355x-amd.sh | 47 -------------------------- 2 files changed, 9 insertions(+), 52 deletions(-) diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh index 533f5e212..8209857bd 100644 --- a/benchmarks/gptoss_fp4_mi355x_docker.sh +++ 
b/benchmarks/gptoss_fp4_mi355x_docker.sh @@ -22,6 +22,8 @@ export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_FUSED_MOE_A16W4=1 +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) + set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ @@ -32,18 +34,20 @@ vllm serve $MODEL --port $PORT \ --block-size=64 \ --no-enable-prefix-caching \ --disable-log-requests \ ---async-scheduling | tee $(mktemp /tmp/server-XXXXXX.log) & +--async-scheduling > $SERVER_LOG 2>&1 & -# Show server logs til' it is up, then stop showing +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! set +x -until curl --output /dev/null --silent --fail http://localhost:$PORT/health; do +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do sleep 5 done -pkill -P $$ tee 2>/dev/null +kill $TAIL_PID +set -x BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -set -x python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model=$MODEL --backend=vllm --base-url="http://localhost:$PORT" \ --dataset-name=random \ diff --git a/runners/launch_mi355x-amd.sh b/runners/launch_mi355x-amd.sh index 009a53108..e77daf5c2 100644 --- a/runners/launch_mi355x-amd.sh +++ b/runners/launch_mi355x-amd.sh @@ -17,11 +17,7 @@ HF_HUB_CACHE_MOUNT="/nfsdata/hf_hub_cache-1/" # Temp solution PORT=8888 -# network_name="bmk-net" server_name="bmk-server" -# client_name="bmk-client" - -# docker network create $network_name set -x docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ @@ -35,50 +31,7 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi355x_docker.sh" -# set +x -# while IFS= read -r line; do -# printf '%s\n' "$line" -# if [[ "$line" =~ Application\ startup\ complete ]]; then -# break -# fi -# done < <(docker logs -f --tail=0 $server_name 2>&1) - -# if [[ "$MODEL" == "amd/DeepSeek-R1-0528-MXFP4-Preview" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then -# if [[ "$OSL" == "8192" ]]; then -# NUM_PROMPTS=$(( CONC * 20 )) -# else -# NUM_PROMPTS=$(( CONC * 50 )) -# fi -# else -# NUM_PROMPTS=$(( CONC * 10 )) -# fi - -# git clone https://github.com/kimbochen/bench_serving.git - -# set -x -# docker run --rm --network=$network_name --name=$client_name \ -# -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -# -e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ -# --entrypoint=python3 \ -# $IMAGE \ -# bench_serving/benchmark_serving.py \ -# --model=$MODEL --backend=vllm --base-url="http://$server_name:$PORT" \ -# --dataset-name=random \ -# --random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ -# --num-prompts=$NUM_PROMPTS \ -# --max-concurrency=$CONC \ -# --request-rate=inf --ignore-eos \ -# --save-result --percentile-metrics="ttft,tpot,itl,e2el" \ -# --result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json - if ls gpucore.* 1> /dev/null 2>&1; then echo "gpucore files exist. 
not good" rm -f gpucore.* fi - - -# while [ -n "$(docker ps -aq)" ]; do -# docker stop $server_name -# # docker network rm $network_name -# sleep 5 -# done From 634768cd538b1f55899d714ec5f1cf670ff74d4a Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 18:33:16 -0600 Subject: [PATCH 031/214] fixing mi300x and updating 325x --- benchmarks/dsr1_fp8_mi325x_docker.sh | 26 ++++++++++++++++- benchmarks/dsr1_fp8_mi325x_slurm.sh | 18 ++++++------ benchmarks/gptoss_fp4_mi300x_docker.sh | 4 +-- benchmarks/gptoss_fp4_mi325x_docker.sh | 26 ++++++++++++++++- benchmarks/gptoss_fp4_mi325x_slurm.sh | 18 ++++++------ runners/launch_mi325x-amd.sh | 40 ++------------------------ 6 files changed, 74 insertions(+), 58 deletions(-) diff --git a/benchmarks/dsr1_fp8_mi325x_docker.sh b/benchmarks/dsr1_fp8_mi325x_docker.sh index f39a8dbbd..41f77ebd3 100644 --- a/benchmarks/dsr1_fp8_mi325x_docker.sh +++ b/benchmarks/dsr1_fp8_mi325x_docker.sh @@ -14,6 +14,8 @@ export SGLANG_USE_AITER=1 +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) + python3 -m sglang.launch_server \ --model-path $MODEL \ --host=0.0.0.0 \ @@ -24,5 +26,27 @@ python3 -m sglang.launch_server \ --mem-fraction-static 0.8 --disable-radix-cache \ --num-continuous-decode-steps 4 \ --max-prefill-tokens 196608 \ - --cuda-graph-max-bs 128 + --cuda-graph-max-bs 128 > $SERVER_LOG 2>&1 & + +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! +set +x +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID + +set -x +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ +--model=$MODEL --backend=vllm --base-url=http://0.0.0.0:$PORT \ +--dataset-name=random \ +--random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ +--num-prompts=$(( $CONC * 10 )) \ +--max-concurrency=$CONC \ +--request-rate=inf --ignore-eos \ +--save-result --percentile-metrics="ttft,tpot,itl,e2el" \ +--result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json diff --git a/benchmarks/dsr1_fp8_mi325x_slurm.sh b/benchmarks/dsr1_fp8_mi325x_slurm.sh index 09dae4dbb..f9da69095 100644 --- a/benchmarks/dsr1_fp8_mi325x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi325x_slurm.sh @@ -23,17 +23,19 @@ python3 -m sglang.launch_server \ --disable-radix-cache \ > $SERVER_LOG 2>&1 & +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"The server is fired up and ready to roll"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend vllm \ --base-url http://0.0.0.0:$PORT \ --dataset-name random \ diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh index 32efdf0fe..0b03900be 100644 --- a/benchmarks/gptoss_fp4_mi300x_docker.sh +++ b/benchmarks/gptoss_fp4_mi300x_docker.sh @@ -51,11 +51,11 @@ set -x BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model=$MODEL --backend=vllm --base-url=http://$server_name:$PORT \ +--model=$MODEL --backend=vllm --base-url=http://0.0.0.0:$PORT \ --dataset-name=random \ --random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ --num-prompts=$(( $CONC * 10 )) \ --max-concurrency=$CONC \ --request-rate=inf --ignore-eos \ --save-result --percentile-metrics="ttft,tpot,itl,e2el" \ ---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json +--result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_mi325x_docker.sh b/benchmarks/gptoss_fp4_mi325x_docker.sh index 05250267f..c57446da3 100644 --- a/benchmarks/gptoss_fp4_mi325x_docker.sh +++ b/benchmarks/gptoss_fp4_mi325x_docker.sh @@ -23,6 +23,8 @@ export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) + set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ @@ -33,4 +35,26 @@ vllm serve $MODEL --port $PORT \ --block-size=64 \ --no-enable-prefix-caching \ --disable-log-requests \ ---async-scheduling +--async-scheduling > $SERVER_LOG 2>&1 & + +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! +set +x +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID + +set -x +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ +--model=$MODEL --backend=vllm --base-url=http://0.0.0.0:$PORT \ +--dataset-name=random \ +--random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ +--num-prompts=$(( $CONC * 10 )) \ +--max-concurrency=$CONC \ +--request-rate=inf --ignore-eos \ +--save-result --percentile-metrics="ttft,tpot,itl,e2el" \ +--result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh index cab549cbc..9cbef3276 100644 --- a/benchmarks/gptoss_fp4_mi325x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh @@ -48,17 +48,19 @@ vllm serve $MODEL --port $PORT \ --async-scheduling \ > $SERVER_LOG 2>&1 & +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend vllm \ --base-url http://0.0.0.0:$PORT \ --dataset-name random \ diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh index 91b9bfad3..4dd66bc17 100644 --- a/runners/launch_mi325x-amd.sh +++ b/runners/launch_mi325x-amd.sh @@ -5,52 +5,16 @@ sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing' HF_HUB_CACHE_MOUNT="/home/kimbosemianalysis/hf_hub_cache/" PORT=8888 -network_name="bmk-net" server_name="bmk-server" -client_name="bmk-client" - -docker network create $network_name set -x -docker run --rm -d --ipc=host --shm-size=16g --network=$network_name --name=$server_name \ +docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ --e ISL -e OSL \ +-e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi325x_docker.sh" - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ Application\ startup\ complete ]]; then - break - fi -done < <(docker logs -f --tail=0 $server_name 2>&1) - -git clone https://github.com/kimbochen/bench_serving.git - -set -x -docker run --rm --network=$network_name --name=$client_name \ --v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ ---entrypoint=python3 \ -$IMAGE \ -bench_serving/benchmark_serving.py \ ---model=$MODEL --backend=vllm --base-url=http://$server_name:$PORT \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) \ ---max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics="ttft,tpot,itl,e2el" \ ---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json - -while [ -n "$(docker ps -aq)" ]; do - docker stop $server_name - docker network rm $network_name - sleep 5 -done From 61a5c8f1415d957ab0251c23e90f1b572da05408 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 18:38:43 -0600 Subject: [PATCH 032/214] reverting max conc to 512 on gptoss fp4 b200 docker --- benchmarks/gptoss_fp4_h100_docker.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index ae889474c..2cec8a165 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -51,7 +51,7 @@ python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --base-url=http://localhost:$PORT \ --dataset-name=random \ --random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ 
---num-prompts=$(( $CONC * 10 )) --max-concurrency=$CONC \ +--num-prompts=$(( $CONC * 10 )) --max-concurrency=512 \ --request-rate=inf --ignore-eos \ --save-result --percentile-metrics='ttft,tpot,itl,e2el' \ --result-dir=/workspace/ \ From 74363e41efe22dc29389fc7952e75eea9fe5b5ed Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 18:43:58 -0600 Subject: [PATCH 033/214] mi325x debug --- runners/launch_mi325x-amd.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh index 4dd66bc17..481603aa8 100644 --- a/runners/launch_mi325x-amd.sh +++ b/runners/launch_mi325x-amd.sh @@ -1,5 +1,7 @@ #!/usr/bin/bash +set -x + sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing' HF_HUB_CACHE_MOUNT="/home/kimbosemianalysis/hf_hub_cache/" @@ -17,4 +19,4 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ -e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \ --entrypoint=/bin/bash \ $IMAGE \ -benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi325x_docker.sh" +benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi325x_docker.sh" \ No newline at end of file From 220e0261a4f984dff0ab543b79976a7981a9f47e Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 13 Nov 2025 19:05:38 -0600 Subject: [PATCH 034/214] add back correct launch script for new mi325x slurm cluster (#231) --- runners/launch_mi325x-amd.sh | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh index 481603aa8..b622ee2e8 100644 --- a/runners/launch_mi325x-amd.sh +++ b/runners/launch_mi325x-amd.sh @@ -1,22 +1,24 @@ -#!/usr/bin/bash +#!/usr/bin/env bash -set -x +export HF_HUB_CACHE_MOUNT="/nfsdata/sa/hf_hub_cache-${USER: -1}/" +export PORT_OFFSET=${USER: -1} -sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing' +PARTITION="compute" +SQUASH_FILE="/nfsdata/sa/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" -HF_HUB_CACHE_MOUNT="/home/kimbosemianalysis/hf_hub_cache/" -PORT=8888 +set -x +salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=256 --time=180 --no-shell +JOB_ID=$(squeue -u $USER -h -o %A | head -n1) -server_name="bmk-server" +srun --jobid=$JOB_ID bash -c "sudo enroot import -o $SQUASH_FILE docker://$IMAGE" +srun --jobid=$JOB_ID \ +--container-image=$SQUASH_FILE \ +--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ +--container-mount-home \ +--container-writable \ +--container-remap-root \ +--container-workdir=/workspace/ \ +--no-container-entrypoint --export=ALL \ +bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_mi325x_slurm.sh -set -x -docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ ---privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ ---cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ --v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ --e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \ ---entrypoint=/bin/bash \ -$IMAGE \ -benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi325x_docker.sh" \ No newline at end of file +scancel $JOB_ID \ No newline at end of file From 5db1af845ac396a15de9faefd11794d1165330db Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 13 Nov 2025 18:33:16 -0600 Subject: [PATCH 035/214] 
fixing mi300x and updating 325x --- runners/launch_mi325x-amd.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh index b622ee2e8..1065167d7 100644 --- a/runners/launch_mi325x-amd.sh +++ b/runners/launch_mi325x-amd.sh @@ -21,4 +21,4 @@ srun --jobid=$JOB_ID \ --no-container-entrypoint --export=ALL \ bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_mi325x_slurm.sh -scancel $JOB_ID \ No newline at end of file +scancel $JOB_ID From b4eb57ee9d424141c8557d0494116de6ae6027da Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 14 Nov 2025 09:04:15 -0600 Subject: [PATCH 036/214] cleanng up --- benchmarks/dsr1_fp4_mi355x_docker.sh | 27 ++++++++++++++++++++++++- benchmarks/dsr1_fp4_mi355x_slurm.sh | 13 +++--------- benchmarks/dsr1_fp8_h200_slurm.sh | 13 +++--------- benchmarks/dsr1_fp8_mi355x_docker.sh | 13 ++++++++---- benchmarks/dsr1_fp8_mi355x_slurm.sh | 18 +++++++++-------- benchmarks/gptoss_fp4_b200_trt_slurm.sh | 20 +++++++++--------- benchmarks/gptoss_fp4_h200_slurm.sh | 18 +++++++++-------- 7 files changed, 72 insertions(+), 50 deletions(-) diff --git a/benchmarks/dsr1_fp4_mi355x_docker.sh b/benchmarks/dsr1_fp4_mi355x_docker.sh index 4d3ed084c..72c4e4778 100644 --- a/benchmarks/dsr1_fp4_mi355x_docker.sh +++ b/benchmarks/dsr1_fp4_mi355x_docker.sh @@ -18,6 +18,8 @@ if [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then fi fi +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) + set -x python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \ --host=0.0.0.0 --port=$PORT \ @@ -27,5 +29,28 @@ python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \ --disable-radix-cache \ --num-continuous-decode-steps=4 \ --max-prefill-tokens=$PREFILL_SIZE \ ---cuda-graph-max-bs=128 +--cuda-graph-max-bs=128 > $SERVER_LOG 2>&1 & + +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
+set +x +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID + +set -x +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ +--model=$MODEL --backend=vllm --base-url="http://localhost:$PORT" \ +--dataset-name=random \ +--random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ +--num-prompts=$NUM_PROMPTS \ +--max-concurrency=$CONC \ +--request-rate=inf --ignore-eos \ +--save-result --percentile-metrics="ttft,tpot,itl,e2el" \ +--result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json + diff --git a/benchmarks/dsr1_fp4_mi355x_slurm.sh b/benchmarks/dsr1_fp4_mi355x_slurm.sh index b88a90f46..ffd2883fd 100644 --- a/benchmarks/dsr1_fp4_mi355x_slurm.sh +++ b/benchmarks/dsr1_fp4_mi355x_slurm.sh @@ -34,17 +34,10 @@ python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \ --cuda-graph-max-bs=128 \ > $SERVER_LOG 2>&1 & -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"The server is fired up and ready to roll"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend vllm \ --base-url "http://0.0.0.0:$PORT" \ --dataset-name random \ diff --git a/benchmarks/dsr1_fp8_h200_slurm.sh b/benchmarks/dsr1_fp8_h200_slurm.sh index 86ea0024f..2298b5486 100644 --- a/benchmarks/dsr1_fp8_h200_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_slurm.sh @@ -44,17 +44,10 @@ else > $SERVER_LOG 2>&1 & fi -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend vllm \ --base-url http://0.0.0.0:$PORT \ --dataset-name random \ diff --git a/benchmarks/dsr1_fp8_mi355x_docker.sh b/benchmarks/dsr1_fp8_mi355x_docker.sh index baad70fd8..2ee734495 100644 --- a/benchmarks/dsr1_fp8_mi355x_docker.sh +++ b/benchmarks/dsr1_fp8_mi355x_docker.sh @@ -14,6 +14,8 @@ export SGLANG_USE_AITER=1 +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) + python3 -m sglang.launch_server \ --model-path $MODEL \ --host=0.0.0.0 \ @@ -24,13 +26,16 @@ python3 -m sglang.launch_server \ --mem-fraction-static 0.8 --disable-radix-cache \ --num-continuous-decode-steps 4 \ --max-prefill-tokens 196608 \ - --cuda-graph-max-bs 128 | tee $(mktemp /tmp/server-XXXXXX.log) & + --cuda-graph-max-bs 128 > $SERVER_LOG 2>&1 & +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
set +x -until curl --output /dev/null --silent --fail http://localhost:$PORT/health; do +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do sleep 5 done -pkill -P $$ tee 2>/dev/null +kill $TAIL_PID if [[ "$MODEL" == "amd/DeepSeek-R1-0528-MXFP4-Preview" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then if [[ "$OSL" == "8192" ]]; then @@ -42,9 +47,9 @@ else NUM_PROMPTS=$(( CONC * 10 )) fi +set -x BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -set -x python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model=$MODEL --backend=vllm --base-url="http://localhost:$PORT" \ --dataset-name=random \ diff --git a/benchmarks/dsr1_fp8_mi355x_slurm.sh b/benchmarks/dsr1_fp8_mi355x_slurm.sh index bf5d60e9c..0bdc36024 100644 --- a/benchmarks/dsr1_fp8_mi355x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi355x_slurm.sh @@ -32,17 +32,19 @@ python3 -m sglang.launch_server \ --max-prefill-tokens 196608 \ --cuda-graph-max-bs 128 > $SERVER_LOG 2>&1 & +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"The server is fired up and ready to roll"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend vllm \ --base-url "http://0.0.0.0:$PORT" \ --dataset-name random \ diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh index 349930dfb..92477dd56 100644 --- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh @@ -79,17 +79,19 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \ > $SERVER_LOG 2>&1 & +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -git clone https://github.com/kimbochen/bench_serving.git +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID + set -x -python3 bench_serving/benchmark_serving.py \ +BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) +git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR +python3 $BENCH_SERVING_DIR/benchmark_serving.py \ --model $MODEL --backend openai \ --base-url http://0.0.0.0:$PORT \ --dataset-name random \ diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh index f92c60425..37851d39c 100644 --- a/benchmarks/gptoss_fp4_h200_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_slurm.sh @@ -48,17 +48,19 @@ PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config --gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \ --disable-log-requests > $SERVER_LOG 2>&1 & +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
 set +x
-while IFS= read -r line; do
-    printf '%s\n' "$line"
-    if [[ "$line" == *"Application startup complete"* ]]; then
-        break
-    fi
-done < <(tail -F -n0 "$SERVER_LOG")
+until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do
+    sleep 5
+done
+kill $TAIL_PID
 
 set -x
-git clone https://github.com/kimbochen/bench_serving.git
-python3 bench_serving/benchmark_serving.py \
+BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX)
+git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR
+python3 $BENCH_SERVING_DIR/benchmark_serving.py \
 --model $MODEL --backend vllm \
 --base-url http://0.0.0.0:$PORT \
 --dataset-name random \

From 04e30f350bc5df5e7f556dea612ba3cd7e464df8 Mon Sep 17 00:00:00 2001
From: Cam Quilici
Date: Fri, 14 Nov 2025 09:09:20 -0600
Subject: [PATCH 037/214] add wait for h200 slurm dsr1

---
 benchmarks/dsr1_fp8_h200_slurm.sh | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/benchmarks/dsr1_fp8_h200_slurm.sh b/benchmarks/dsr1_fp8_h200_slurm.sh
index 2298b5486..14e3c2a7b 100644
--- a/benchmarks/dsr1_fp8_h200_slurm.sh
+++ b/benchmarks/dsr1_fp8_h200_slurm.sh
@@ -44,6 +44,15 @@ else
     > $SERVER_LOG 2>&1 &
 fi
 
+# Show logs until server is ready
+tail -f $SERVER_LOG &
+TAIL_PID=$!
+set +x
+until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do
+    sleep 5
+done
+kill $TAIL_PID
+
 set -x
 BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX)
 git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR

From d36965abc3d33099c320f13fc39221326a13b961 Mon Sep 17 00:00:00 2001
From: Cam Quilici
Date: Fri, 14 Nov 2025 09:20:27 -0600
Subject: [PATCH 038/214] max num seqs back to 512 for gptoss fp4 b200 docker

---
 benchmarks/gptoss_fp4_b200_docker.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh
index ac9aefefe..878564e11 100644
--- a/benchmarks/gptoss_fp4_b200_docker.sh
+++ b/benchmarks/gptoss_fp4_b200_docker.sh
@@ -48,7 +48,7 @@ SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
 
 set -x
 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \
---gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \
+--gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs 512 \
 --disable-log-requests > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
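
The readiness-wait block that these patches copy into each benchmark script (tail -f on the server log, then polling the /health endpoint every 5 seconds) is duplicated across every hardware variant by this point in the series. Below is a minimal sketch of the same pattern as a shared shell helper, folded together with the kill -0 liveness check that a later patch in this series adds to gptoss_fp4_h100_docker.sh. The wait_for_server name and the timeout parameter are illustrative assumptions, not code that exists in this repo:

#!/usr/bin/env bash
# Sketch only: wait_for_server and its timeout are assumed names, not repo code.
# Streams the server log while polling the health endpoint, and fails fast if
# the server process exits before it ever becomes healthy.
wait_for_server() {
    local port=$1 server_pid=$2 server_log=$3 timeout=${4:-1800}
    local deadline=$(( SECONDS + timeout ))

    # Mirror the inline "show logs until server is ready" blocks.
    tail -f "$server_log" &
    local tail_pid=$!

    until curl --output /dev/null --silent --fail "http://0.0.0.0:${port}/health"; do
        # Stop polling if the server process died instead of waiting forever.
        if ! kill -0 "$server_pid" 2>/dev/null; then
            echo "Server died before becoming healthy." >&2
            kill "$tail_pid" 2>/dev/null
            return 1
        fi
        # Bail out after the (assumed) timeout rather than hanging the CI job.
        if (( SECONDS >= deadline )); then
            echo "Server not healthy after ${timeout}s." >&2
            kill "$tail_pid" 2>/dev/null
            return 1
        fi
        sleep 5
    done

    kill "$tail_pid" 2>/dev/null
}

With a helper like this, each script's inline block would reduce to a single call such as: wait_for_server "$PORT" "$SERVER_PID" "$SERVER_LOG" || exit 1, keeping failure propagation consistent across the docker and slurm variants.
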
From fa7cbca344a604f6b578a918609582edfc29eb04 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 14 Nov 2025 09:40:10 -0600 Subject: [PATCH 039/214] fix port issue for dsr1 mi300x docker --- benchmarks/dsr1_fp8_mi300x_docker.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/dsr1_fp8_mi300x_docker.sh b/benchmarks/dsr1_fp8_mi300x_docker.sh index 82cb4fbee..033b84ed0 100644 --- a/benchmarks/dsr1_fp8_mi300x_docker.sh +++ b/benchmarks/dsr1_fp8_mi300x_docker.sh @@ -51,7 +51,7 @@ set -x BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model=$MODEL --backend=vllm --base-url=http://$server_name:$PORT \ +--model=$MODEL --backend=vllm --base-url=http://0.0.0.0:$PORT \ --dataset-name=random \ --random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ --num-prompts=$(( $CONC * 10 )) \ From 1031ac957ec607bb586bceda5179ab2145e6836f Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 14 Nov 2025 11:50:53 -0600 Subject: [PATCH 040/214] fix mi355x docker NUM_PROMPTS --- runners/launch_mi355x-amd.sh | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/runners/launch_mi355x-amd.sh b/runners/launch_mi355x-amd.sh index e77daf5c2..5f3cbb290 100644 --- a/runners/launch_mi355x-amd.sh +++ b/runners/launch_mi355x-amd.sh @@ -19,13 +19,23 @@ PORT=8888 server_name="bmk-server" +if [[ "$MODEL" == "amd/DeepSeek-R1-0528-MXFP4-Preview" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then + if [[ "$OSL" == "8192" ]]; then + export NUM_PROMPTS=$(( CONC * 20 )) + else + export NUM_PROMPTS=$(( CONC * 50 )) + fi +else + export NUM_PROMPTS=$(( CONC * 10 )) +fi + set -x docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ +-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT -e NUM_PROMPTS \ -e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \ --entrypoint=/bin/bash \ $IMAGE \ From 8b847f13bfb1a76689460aadaeaac61b92539e47 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 14 Nov 2025 15:46:51 -0600 Subject: [PATCH 041/214] adding prop of failure for server logs --- benchmarks/dsr1_fp4_mi355x_slurm.sh | 9 +++++++++ benchmarks/dsr1_fp8_h200_trt_slurm.sh | 2 +- benchmarks/gptoss_fp4_h100_docker.sh | 6 ++++++ runners/launch_b200-nb.sh | 2 ++ 4 files changed, 18 insertions(+), 1 deletion(-) diff --git a/benchmarks/dsr1_fp4_mi355x_slurm.sh b/benchmarks/dsr1_fp4_mi355x_slurm.sh index ffd2883fd..cad5efdc5 100644 --- a/benchmarks/dsr1_fp4_mi355x_slurm.sh +++ b/benchmarks/dsr1_fp4_mi355x_slurm.sh @@ -34,6 +34,15 @@ python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \ --cuda-graph-max-bs=128 \ > $SERVER_LOG 2>&1 & +# Show logs until server is ready +tail -f $SERVER_LOG & +TAIL_PID=$! 
+set +x +until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + sleep 5 +done +kill $TAIL_PID + set -x BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index a44769bc6..15647bbd2 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -69,9 +69,9 @@ PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \ --tp_size=$TP --ep_size=$EP_SIZE \ --extra_llm_api_options=$EXTRA_CONFIG_FILE \ > $SERVER_LOG 2>&1 & + SERVER_PID=$! - # Show logs until server is ready tail -f $SERVER_LOG & TAIL_PID=$! diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 2cec8a165..6b95fae1a 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -32,11 +32,17 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --max-num-seqs=$CONC \ --disable-log-requests > $SERVER_LOG 2>&1 & +SERVER_PID=$! + # Show logs until server is ready tail -f $SERVER_LOG & TAIL_PID=$! set +x until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do + if ! kill -0 $SERVER_PID 2>/dev/null; then + echo "Server died before becoming healthy. Exiting." + exit 1 + fi sleep 5 done kill $TAIL_PID diff --git a/runners/launch_b200-nb.sh b/runners/launch_b200-nb.sh index 9a3dfa909..ecd1466dd 100644 --- a/runners/launch_b200-nb.sh +++ b/runners/launch_b200-nb.sh @@ -13,3 +13,5 @@ srun --partition=$PARTITION --gres=gpu:$TP --exclusive \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT_OFFSET=${USER: -1} \ bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_slurm.sh + +scancel $JOB_ID \ No newline at end of file From 832bafce3354dce9703c6d74084baac18b4b29c3 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 14 Nov 2025 16:05:09 -0600 Subject: [PATCH 042/214] add utils function for benchmark --- benchmarks/benchmark_lib.sh | 143 ++++++++++++++++++++++++ benchmarks/dsr1_fp4_b200_docker.sh | 26 +++-- benchmarks/dsr1_fp4_b200_trt_slurm.sh | 26 +++-- benchmarks/dsr1_fp4_mi355x_docker.sh | 25 +++-- benchmarks/dsr1_fp4_mi355x_slurm.sh | 25 +++-- benchmarks/dsr1_fp8_b200_docker.sh | 26 +++-- benchmarks/dsr1_fp8_b200_trt_slurm.sh | 26 +++-- benchmarks/dsr1_fp8_h200_slurm.sh | 26 +++-- benchmarks/dsr1_fp8_h200_trt_slurm.sh | 26 +++-- benchmarks/dsr1_fp8_mi300x_docker.sh | 25 +++-- benchmarks/dsr1_fp8_mi300x_slurm.sh | 26 +++-- benchmarks/dsr1_fp8_mi325x_docker.sh | 25 +++-- benchmarks/dsr1_fp8_mi325x_slurm.sh | 26 +++-- benchmarks/dsr1_fp8_mi355x_docker.sh | 25 +++-- benchmarks/dsr1_fp8_mi355x_slurm.sh | 25 +++-- benchmarks/gptoss_fp4_b200_docker.sh | 26 +++-- benchmarks/gptoss_fp4_b200_trt_slurm.sh | 26 +++-- benchmarks/gptoss_fp4_h100_docker.sh | 28 ++--- benchmarks/gptoss_fp4_h100_slurm.sh | 28 ++--- benchmarks/gptoss_fp4_h200_slurm.sh | 26 +++-- benchmarks/gptoss_fp4_h200_trt_slurm.sh | 26 +++-- benchmarks/gptoss_fp4_mi300x_docker.sh | 25 +++-- benchmarks/gptoss_fp4_mi300x_slurm.sh | 26 +++-- benchmarks/gptoss_fp4_mi325x_docker.sh | 25 +++-- benchmarks/gptoss_fp4_mi325x_slurm.sh | 26 +++-- benchmarks/gptoss_fp4_mi355x_docker.sh | 25 +++-- benchmarks/gptoss_fp4_mi355x_slurm.sh | 25 +++-- 27 files changed, 512 insertions(+), 301 deletions(-) create mode 100644 benchmarks/benchmark_lib.sh diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh new file mode 100644 
index 000000000..152b5e4b6 --- /dev/null +++ b/benchmarks/benchmark_lib.sh @@ -0,0 +1,143 @@ +#!/usr/bin/env bash + +# Shared benchmarking utilities for InferenceMAX + +# Run benchmark serving with standardized parameters +# All parameters are required +# Parameters: +# --model: Model name +# --port: Server port +# --backend: Backend type - 'vllm' or 'openai' +# --input-len: Random input sequence length +# --output-len: Random output sequence length +# --random-range-ratio: Random range ratio +# --num-prompts: Number of prompts +# --max-concurrency: Max concurrency +# --result-filename: Result filename without extension +# --result-dir: Result directory +run_benchmark_serving() { + local model="" + local port="" + local backend="" + local input_len="" + local output_len="" + local random_range_ratio="" + local num_prompts="" + local max_concurrency="" + local result_filename="" + local result_dir="" + + # Parse arguments + while [[ $# -gt 0 ]]; do + case $1 in + --model) + model="$2" + shift 2 + ;; + --port) + port="$2" + shift 2 + ;; + --backend) + backend="$2" + shift 2 + ;; + --input-len) + input_len="$2" + shift 2 + ;; + --output-len) + output_len="$2" + shift 2 + ;; + --random-range-ratio) + random_range_ratio="$2" + shift 2 + ;; + --num-prompts) + num_prompts="$2" + shift 2 + ;; + --max-concurrency) + max_concurrency="$2" + shift 2 + ;; + --result-filename) + result_filename="$2" + shift 2 + ;; + --result-dir) + result_dir="$2" + shift 2 + ;; + *) + echo "Unknown parameter: $1" + return 1 + ;; + esac + done + + # Validate all required parameters + if [[ -z "$model" ]]; then + echo "Error: --model is required" + return 1 + fi + if [[ -z "$port" ]]; then + echo "Error: --port is required" + return 1 + fi + if [[ -z "$backend" ]]; then + echo "Error: --backend is required" + return 1 + fi + if [[ -z "$input_len" ]]; then + echo "Error: --input-len is required" + return 1 + fi + if [[ -z "$output_len" ]]; then + echo "Error: --output-len is required" + return 1 + fi + if [[ -z "$random_range_ratio" ]]; then + echo "Error: --random-range-ratio is required" + return 1 + fi + if [[ -z "$num_prompts" ]]; then + echo "Error: --num-prompts is required" + return 1 + fi + if [[ -z "$max_concurrency" ]]; then + echo "Error: --max-concurrency is required" + return 1 + fi + if [[ -z "$result_filename" ]]; then + echo "Error: --result-filename is required" + return 1 + fi + if [[ -z "$result_dir" ]]; then + echo "Error: --result-dir is required" + return 1 + fi + + # Clone benchmark serving repo + local BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) + git clone https://github.com/kimbochen/bench_serving.git "$BENCH_SERVING_DIR" + + # Run benchmark + python3 "$BENCH_SERVING_DIR/benchmark_serving.py" \ + --model "$model" \ + --backend "$backend" \ + --base-url "http://0.0.0.0:$port" \ + --dataset-name random \ + --random-input-len "$input_len" \ + --random-output-len "$output_len" \ + --random-range-ratio "$random_range_ratio" \ + --num-prompts "$num_prompts" \ + --max-concurrency "$max_concurrency" \ + --request-rate inf \ + --ignore-eos \ + --save-result \ + --percentile-metrics 'ttft,tpot,itl,e2el' \ + --result-dir "$result_dir" \ + --result-filename "$result_filename.json" +} diff --git a/benchmarks/dsr1_fp4_b200_docker.sh b/benchmarks/dsr1_fp4_b200_docker.sh index 8b9f116c6..317015ba8 100644 --- a/benchmarks/dsr1_fp4_b200_docker.sh +++ b/benchmarks/dsr1_fp4_b200_docker.sh @@ -36,16 +36,20 @@ done kill $TAIL_PID pip install -q datasets pandas + +# Source benchmark utilities +source "$(dirname 
"$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend vllm --base-url http://localhost:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $NUM_PROMPTS \ ---max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$NUM_PROMPTS" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh index 6896880fb..897ef8527 100644 --- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh @@ -110,16 +110,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend openai \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp4_mi355x_docker.sh b/benchmarks/dsr1_fp4_mi355x_docker.sh index 72c4e4778..05603ecae 100644 --- a/benchmarks/dsr1_fp4_mi355x_docker.sh +++ b/benchmarks/dsr1_fp4_mi355x_docker.sh @@ -40,17 +40,20 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model=$MODEL --backend=vllm --base-url="http://localhost:$PORT" \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$NUM_PROMPTS \ ---max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics="ttft,tpot,itl,e2el" \ ---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$NUM_PROMPTS" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git 
a/benchmarks/dsr1_fp4_mi355x_slurm.sh b/benchmarks/dsr1_fp4_mi355x_slurm.sh index cad5efdc5..c47bbfb38 100644 --- a/benchmarks/dsr1_fp4_mi355x_slurm.sh +++ b/benchmarks/dsr1_fp4_mi355x_slurm.sh @@ -43,16 +43,19 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url "http://0.0.0.0:$PORT" \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics "ttft,tpot,itl,e2el" \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp8_b200_docker.sh b/benchmarks/dsr1_fp8_b200_docker.sh index f1412264c..fa498ff3e 100644 --- a/benchmarks/dsr1_fp8_b200_docker.sh +++ b/benchmarks/dsr1_fp8_b200_docker.sh @@ -47,15 +47,19 @@ done kill $TAIL_PID pip install -q datasets pandas + +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend vllm --base-url http://localhost:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $NUM_PROMPTS \ ---max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json \ No newline at end of file +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$NUM_PROMPTS" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ No newline at end of file diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh index 741ecdb92..a22536d82 100644 --- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh @@ -81,16 +81,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ 
---result-filename $RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend openai \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp8_h200_slurm.sh b/benchmarks/dsr1_fp8_h200_slurm.sh index 14e3c2a7b..7444f763f 100644 --- a/benchmarks/dsr1_fp8_h200_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_slurm.sh @@ -53,16 +53,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index 15647bbd2..94baa7850 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -85,16 +85,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend openai \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp8_mi300x_docker.sh b/benchmarks/dsr1_fp8_mi300x_docker.sh index 033b84ed0..5ffebb941 100644 --- a/benchmarks/dsr1_fp8_mi300x_docker.sh +++ b/benchmarks/dsr1_fp8_mi300x_docker.sh @@ -47,15 +47,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ 
---model=$MODEL --backend=vllm --base-url=http://0.0.0.0:$PORT \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) \ ---max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics="ttft,tpot,itl,e2el" \ ---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp8_mi300x_slurm.sh b/benchmarks/dsr1_fp8_mi300x_slurm.sh index 31fe1bf55..ba1597982 100644 --- a/benchmarks/dsr1_fp8_mi300x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi300x_slurm.sh @@ -56,16 +56,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model=$MODEL --backend=vllm \ ---base-url="http://0.0.0.0:$PORT" \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) --max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics='ttft,tpot,itl,e2el' \ ---result-dir=/workspace/ \ ---result-filename=$RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp8_mi325x_docker.sh b/benchmarks/dsr1_fp8_mi325x_docker.sh index 41f77ebd3..c0f95846d 100644 --- a/benchmarks/dsr1_fp8_mi325x_docker.sh +++ b/benchmarks/dsr1_fp8_mi325x_docker.sh @@ -37,16 +37,19 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model=$MODEL --backend=vllm --base-url=http://0.0.0.0:$PORT \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) \ ---max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics="ttft,tpot,itl,e2el" \ ---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp8_mi325x_slurm.sh b/benchmarks/dsr1_fp8_mi325x_slurm.sh index f9da69095..1ccec681f 100644 --- a/benchmarks/dsr1_fp8_mi325x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi325x_slurm.sh @@ -32,16 +32,18 @@ until 
curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp8_mi355x_docker.sh b/benchmarks/dsr1_fp8_mi355x_docker.sh index 2ee734495..50d9bb02d 100644 --- a/benchmarks/dsr1_fp8_mi355x_docker.sh +++ b/benchmarks/dsr1_fp8_mi355x_docker.sh @@ -47,18 +47,21 @@ else NUM_PROMPTS=$(( CONC * 10 )) fi +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model=$MODEL --backend=vllm --base-url="http://localhost:$PORT" \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$NUM_PROMPTS \ ---max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics="ttft,tpot,itl,e2el" \ ---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$NUM_PROMPTS" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp8_mi355x_slurm.sh b/benchmarks/dsr1_fp8_mi355x_slurm.sh index 0bdc36024..86b5f9649 100644 --- a/benchmarks/dsr1_fp8_mi355x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi355x_slurm.sh @@ -41,15 +41,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url "http://0.0.0.0:$PORT" \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics "ttft,tpot,itl,e2el" \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + 
--result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh index 878564e11..208ea278d 100644 --- a/benchmarks/gptoss_fp4_b200_docker.sh +++ b/benchmarks/gptoss_fp4_b200_docker.sh @@ -63,15 +63,19 @@ done kill $TAIL_PID pip install -q datasets pandas + +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend vllm --base-url http://localhost:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $NUM_PROMPTS \ ---max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json \ No newline at end of file +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$NUM_PROMPTS" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh index 92477dd56..4647cb346 100644 --- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh @@ -88,16 +88,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend openai \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 6b95fae1a..42bbf6b1a 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -48,17 +48,19 @@ done kill $TAIL_PID pip install -q datasets pandas + +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model=$MODEL \ ---backend=vllm \ ---base-url=http://localhost:$PORT \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) --max-concurrency=512 \ ---request-rate=inf --ignore-eos \ ---save-result 
--percentile-metrics='ttft,tpot,itl,e2el' \ ---result-dir=/workspace/ \ ---result-filename=$RESULT_FILENAME.json \ No newline at end of file +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency 512 \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh index d82bebf72..5f31f0abf 100644 --- a/benchmarks/gptoss_fp4_h100_slurm.sh +++ b/benchmarks/gptoss_fp4_h100_slurm.sh @@ -45,17 +45,19 @@ done kill $TAIL_PID pip install -q datasets pandas + +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model=$MODEL \ ---backend=vllm \ ---base-url="http://0.0.0.0:$PORT" \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) --max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics='ttft,tpot,itl,e2el' \ ---result-dir=/workspace/ \ ---result-filename=$RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh index 37851d39c..146ab16a5 100644 --- a/benchmarks/gptoss_fp4_h200_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_slurm.sh @@ -57,16 +57,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/gptoss_fp4_h200_trt_slurm.sh b/benchmarks/gptoss_fp4_h200_trt_slurm.sh index 0927a0d61..ffe6e65de 100644 --- a/benchmarks/gptoss_fp4_h200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_trt_slurm.sh @@ -68,16 +68,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone 
https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend openai \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh index 0b03900be..a3b1dc4f3 100644 --- a/benchmarks/gptoss_fp4_mi300x_docker.sh +++ b/benchmarks/gptoss_fp4_mi300x_docker.sh @@ -47,15 +47,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model=$MODEL --backend=vllm --base-url=http://0.0.0.0:$PORT \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) \ ---max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics="ttft,tpot,itl,e2el" \ ---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json \ No newline at end of file +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_mi300x_slurm.sh b/benchmarks/gptoss_fp4_mi300x_slurm.sh index 0e4a0b3b2..053c79197 100644 --- a/benchmarks/gptoss_fp4_mi300x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi300x_slurm.sh @@ -57,15 +57,17 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git 
a/benchmarks/gptoss_fp4_mi325x_docker.sh b/benchmarks/gptoss_fp4_mi325x_docker.sh index c57446da3..62d2c5bd0 100644 --- a/benchmarks/gptoss_fp4_mi325x_docker.sh +++ b/benchmarks/gptoss_fp4_mi325x_docker.sh @@ -46,15 +46,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model=$MODEL --backend=vllm --base-url=http://0.0.0.0:$PORT \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) \ ---max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics="ttft,tpot,itl,e2el" \ ---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh index 9cbef3276..c1ac421b6 100644 --- a/benchmarks/gptoss_fp4_mi325x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh @@ -57,16 +57,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh index 8209857bd..b26bf11b5 100644 --- a/benchmarks/gptoss_fp4_mi355x_docker.sh +++ b/benchmarks/gptoss_fp4_mi355x_docker.sh @@ -45,15 +45,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model=$MODEL --backend=vllm --base-url="http://localhost:$PORT" \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$NUM_PROMPTS \ ---max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics="ttft,tpot,itl,e2el" \ 
---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$NUM_PROMPTS" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/gptoss_fp4_mi355x_slurm.sh b/benchmarks/gptoss_fp4_mi355x_slurm.sh index 1fcba771f..d378685db 100644 --- a/benchmarks/gptoss_fp4_mi355x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi355x_slurm.sh @@ -47,15 +47,18 @@ until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do done kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + set -x -BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) -git clone https://github.com/kimbochen/bench_serving.git $BENCH_SERVING_DIR -python3 $BENCH_SERVING_DIR/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url "http://0.0.0.0:$PORT" \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics "ttft,tpot,itl,e2el" \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ From ebe3b626ae96bd301f41bff90d71496290a8a3d7 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 14 Nov 2025 16:09:36 -0600 Subject: [PATCH 043/214] add utils function for benchmark --- benchmarks/benchmark_lib.sh | 1 + benchmarks/gptoss_fp4_h100_docker.sh | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 152b5e4b6..f253f18f4 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -124,6 +124,7 @@ run_benchmark_serving() { git clone https://github.com/kimbochen/bench_serving.git "$BENCH_SERVING_DIR" # Run benchmark + set -x python3 "$BENCH_SERVING_DIR/benchmark_serving.py" \ --model "$model" \ --backend "$backend" \ diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 42bbf6b1a..f2b17f990 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -52,7 +52,6 @@ pip install -q datasets pandas # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ From aa9070ffbedd7878bb3cc5ac1e25ea97a59f9ccf Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 14 Nov 2025 16:30:16 -0600 Subject: [PATCH 044/214] function-ize the waiting for server to start --- benchmarks/benchmark_lib.sh | 68 +++++++++++++++++++++++++ benchmarks/dsr1_fp4_b200_docker.sh | 16 +++--- benchmarks/dsr1_fp4_b200_trt_slurm.sh | 13 ++--- benchmarks/dsr1_fp4_mi355x_docker.sh | 12 ++--- benchmarks/dsr1_fp4_mi355x_slurm.sh | 12 ++--- benchmarks/dsr1_fp8_b200_docker.sh | 16 +++--- benchmarks/dsr1_fp8_b200_trt_slurm.sh | 12 ++--- benchmarks/dsr1_fp8_h200_slurm.sh | 12 ++--- benchmarks/dsr1_fp8_h200_trt_slurm.sh | 16 ++---- benchmarks/dsr1_fp8_mi300x_docker.sh | 3 ++ benchmarks/dsr1_fp8_mi300x_slurm.sh | 12 ++--- 
 benchmarks/dsr1_fp8_mi325x_docker.sh | 12 ++---
 benchmarks/dsr1_fp8_mi325x_slurm.sh | 12 ++---
 benchmarks/dsr1_fp8_mi355x_docker.sh | 16 ++----
 benchmarks/dsr1_fp8_mi355x_slurm.sh | 12 ++---
 benchmarks/gptoss_fp4_b200_docker.sh | 16 ++----
 benchmarks/gptoss_fp4_b200_trt_slurm.sh | 13 ++---
 benchmarks/gptoss_fp4_h100_docker.sh | 20 ++------
 benchmarks/gptoss_fp4_h100_slurm.sh | 16 +++---
 benchmarks/gptoss_fp4_h200_slurm.sh | 12 ++---
 benchmarks/gptoss_fp4_h200_trt_slurm.sh | 12 ++---
 benchmarks/gptoss_fp4_mi300x_docker.sh | 12 ++---
 benchmarks/gptoss_fp4_mi300x_slurm.sh | 12 ++---
 benchmarks/gptoss_fp4_mi325x_docker.sh | 12 ++---
 benchmarks/gptoss_fp4_mi325x_slurm.sh | 12 ++---
 benchmarks/gptoss_fp4_mi355x_docker.sh | 12 ++---
 benchmarks/gptoss_fp4_mi355x_slurm.sh | 12 ++---
 27 files changed, 170 insertions(+), 235 deletions(-)

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index f253f18f4..133f9095f 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -2,6 +2,74 @@

 # Shared benchmarking utilities for InferenceMAX

+# Wait for server to be ready by polling the health endpoint
+# All parameters are required unless marked optional
+# Parameters:
+# --port: Server port
+# --server-log: Path to server log file
+# --server-pid: Server process ID (required)
+# --sleep-interval: Sleep interval between health checks (optional, default: 5)
+wait_for_server_ready() {
+    local port=""
+    local server_log=""
+    local server_pid=""
+    local sleep_interval=5
+
+    # Parse arguments
+    while [[ $# -gt 0 ]]; do
+        case $1 in
+            --port)
+                port="$2"
+                shift 2
+                ;;
+            --server-log)
+                server_log="$2"
+                shift 2
+                ;;
+            --server-pid)
+                server_pid="$2"
+                shift 2
+                ;;
+            --sleep-interval)
+                sleep_interval="$2"
+                shift 2
+                ;;
+            *)
+                echo "Unknown parameter: $1"
+                return 1
+                ;;
+        esac
+    done
+
+    # Validate required parameters
+    if [[ -z "$port" ]]; then
+        echo "Error: --port is required"
+        return 1
+    fi
+    if [[ -z "$server_log" ]]; then
+        echo "Error: --server-log is required"
+        return 1
+    fi
+    if [[ -z "$server_pid" ]]; then
+        echo "Error: --server-pid is required"
+        return 1
+    fi
+
+    # Show logs until server is ready
+    tail -f "$server_log" &
+    local TAIL_PID=$!
+    set +x
+    until curl --output /dev/null --silent --fail http://0.0.0.0:$port/health; do
+        if ! kill -0 "$server_pid" 2>/dev/null; then
+            echo "Server died before becoming healthy. Exiting."
+            kill $TAIL_PID
+            exit 1
+        fi
+        sleep "$sleep_interval"
+    done
+    kill $TAIL_PID
+}
+
 # Run benchmark serving with standardized parameters
 # All parameters are required
 # Parameters:
diff --git a/benchmarks/dsr1_fp4_b200_docker.sh b/benchmarks/dsr1_fp4_b200_docker.sh
index 317015ba8..5f4ab3c5c 100644
--- a/benchmarks/dsr1_fp4_b200_docker.sh
+++ b/benchmarks/dsr1_fp4_b200_docker.sh
@@ -26,20 +26,16 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.
 --ep-size $EP_SIZE --quantization modelopt_fp4 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
 --enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10 > $SERVER_LOG 2>&1 &

-# Show logs until server is ready
-tail -f $SERVER_LOG &
-TAIL_PID=$!
-set +x
-until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do
-  sleep 5
-done
-kill $TAIL_PID
-
-pip install -q datasets pandas
+SERVER_PID=$!
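
With both helpers in place, each script in the series reduces to the same launch / wait / measure shape. A minimal calling sketch, assuming a vLLM-style server (the serve command is illustrative; every script supplies its own launch line):

#!/usr/bin/env bash
source "$(dirname "$0")/benchmark_lib.sh"

SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" > "$SERVER_LOG" 2>&1 &
SERVER_PID=$!  # capture right away: $! names the most recent background job

wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

run_benchmark_serving \
    --model "$MODEL" --port "$PORT" --backend vllm \
    --input-len "$ISL" --output-len "$OSL" --random-range-ratio "$RANDOM_RANGE_RATIO" \
    --num-prompts $(( CONC * 10 )) --max-concurrency "$CONC" \
    --result-filename "$RESULT_FILENAME" --result-dir /workspace/
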
# Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh index 897ef8527..a9f7cc9d4 100644 --- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh @@ -100,19 +100,14 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \ --extra_llm_api_options=$EXTRA_CONFIG_FILE \ > $SERVER_LOG 2>&1 & - -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID +SERVER_PID=$! # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/dsr1_fp4_mi355x_docker.sh b/benchmarks/dsr1_fp4_mi355x_docker.sh index 05603ecae..eed9d1273 100644 --- a/benchmarks/dsr1_fp4_mi355x_docker.sh +++ b/benchmarks/dsr1_fp4_mi355x_docker.sh @@ -31,18 +31,14 @@ python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \ --max-prefill-tokens=$PREFILL_SIZE \ --cuda-graph-max-bs=128 > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID +SERVER_PID=$! # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/dsr1_fp4_mi355x_slurm.sh b/benchmarks/dsr1_fp4_mi355x_slurm.sh index c47bbfb38..afb7ca29c 100644 --- a/benchmarks/dsr1_fp4_mi355x_slurm.sh +++ b/benchmarks/dsr1_fp4_mi355x_slurm.sh @@ -34,18 +34,12 @@ python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \ --cuda-graph-max-bs=128 \ > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID - # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/dsr1_fp8_b200_docker.sh b/benchmarks/dsr1_fp8_b200_docker.sh index fa498ff3e..9a219339c 100644 --- a/benchmarks/dsr1_fp8_b200_docker.sh +++ b/benchmarks/dsr1_fp8_b200_docker.sh @@ -37,20 +37,16 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0. --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL --disable-radix-cache \ --attention-backend trtllm_mla --stream-interval 30 --ep-size $EP_SIZE --moe-runner-backend flashinfer_trtllm --quantization fp8 > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID - -pip install -q datasets pandas +SERVER_PID=$! 
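
The ! kill -0 "$server_pid" guard inside the helper is what turns the poll into a liveness check: signal 0 is never delivered, it only asks the kernel whether the PID still exists and can be signaled, so the loop can tell "server still warming up" apart from "server already crashed". A standalone sketch of the idiom:

sleep 30 &
pid=$!
if kill -0 "$pid" 2>/dev/null; then
    echo "process $pid is alive"
else
    echo "process $pid has exited (or is not ours to signal)"
fi
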
# Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh index a22536d82..a78fece38 100644 --- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh @@ -72,18 +72,12 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \ SERVER_PID=$! -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID - # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/dsr1_fp8_h200_slurm.sh b/benchmarks/dsr1_fp8_h200_slurm.sh index 7444f763f..6eeec6df1 100644 --- a/benchmarks/dsr1_fp8_h200_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_slurm.sh @@ -44,18 +44,14 @@ else > $SERVER_LOG 2>&1 & fi -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID +SERVER_PID=$! # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index 94baa7850..74b2ce8df 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -72,22 +72,12 @@ PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \ SERVER_PID=$! -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - if ! kill -0 $SERVER_PID 2>/dev/null; then - echo "Server died before becoming healthy. Exiting." - exit 1 - fi - sleep 5 -done -kill $TAIL_PID - # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/dsr1_fp8_mi300x_docker.sh b/benchmarks/dsr1_fp8_mi300x_docker.sh index 5ffebb941..db27f4e74 100644 --- a/benchmarks/dsr1_fp8_mi300x_docker.sh +++ b/benchmarks/dsr1_fp8_mi300x_docker.sh @@ -50,6 +50,9 @@ kill $TAIL_PID # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/dsr1_fp8_mi300x_slurm.sh b/benchmarks/dsr1_fp8_mi300x_slurm.sh index ba1597982..0d191299c 100644 --- a/benchmarks/dsr1_fp8_mi300x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi300x_slurm.sh @@ -47,18 +47,14 @@ python3 -m sglang.launch_server \ --disable-radix-cache \ > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! 
-set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID +SERVER_PID=$! # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/dsr1_fp8_mi325x_docker.sh b/benchmarks/dsr1_fp8_mi325x_docker.sh index c0f95846d..9f34e7563 100644 --- a/benchmarks/dsr1_fp8_mi325x_docker.sh +++ b/benchmarks/dsr1_fp8_mi325x_docker.sh @@ -28,18 +28,14 @@ python3 -m sglang.launch_server \ --max-prefill-tokens 196608 \ --cuda-graph-max-bs 128 > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID +SERVER_PID=$! # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/dsr1_fp8_mi325x_slurm.sh b/benchmarks/dsr1_fp8_mi325x_slurm.sh index 1ccec681f..15e9cce64 100644 --- a/benchmarks/dsr1_fp8_mi325x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi325x_slurm.sh @@ -23,18 +23,12 @@ python3 -m sglang.launch_server \ --disable-radix-cache \ > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID - # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/dsr1_fp8_mi355x_docker.sh b/benchmarks/dsr1_fp8_mi355x_docker.sh index 50d9bb02d..a86d7adbe 100644 --- a/benchmarks/dsr1_fp8_mi355x_docker.sh +++ b/benchmarks/dsr1_fp8_mi355x_docker.sh @@ -28,14 +28,11 @@ python3 -m sglang.launch_server \ --max-prefill-tokens 196608 \ --cuda-graph-max-bs 128 > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" if [[ "$MODEL" == "amd/DeepSeek-R1-0528-MXFP4-Preview" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then if [[ "$OSL" == "8192" ]]; then @@ -47,9 +44,6 @@ else NUM_PROMPTS=$(( CONC * 10 )) fi -# Source benchmark utilities -source "$(dirname "$0")/benchmark_lib.sh" - set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/dsr1_fp8_mi355x_slurm.sh b/benchmarks/dsr1_fp8_mi355x_slurm.sh index 86b5f9649..54ba29fc0 100644 --- a/benchmarks/dsr1_fp8_mi355x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi355x_slurm.sh @@ -32,18 +32,12 @@ python3 -m sglang.launch_server \ --max-prefill-tokens 196608 \ --cuda-graph-max-bs 128 > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! 
-set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID - # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh index 208ea278d..60c1a1582 100644 --- a/benchmarks/gptoss_fp4_b200_docker.sh +++ b/benchmarks/gptoss_fp4_b200_docker.sh @@ -53,20 +53,14 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ SERVER_PID=$! -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID - -pip install -q datasets pandas - # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh index 4647cb346..0ec2f325f 100644 --- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh @@ -78,19 +78,14 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \ --extra_llm_api_options=$EXTRA_CONFIG_FILE \ > $SERVER_LOG 2>&1 & - -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID +SERVER_PID=$! # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index f2b17f990..9cf7c5275 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -34,24 +34,14 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ SERVER_PID=$! -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - if ! kill -0 $SERVER_PID 2>/dev/null; then - echo "Server died before becoming healthy. Exiting." - exit 1 - fi - sleep 5 -done -kill $TAIL_PID - -pip install -q datasets pandas - # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh index 5f31f0abf..c3d598116 100644 --- a/benchmarks/gptoss_fp4_h100_slurm.sh +++ b/benchmarks/gptoss_fp4_h100_slurm.sh @@ -35,20 +35,16 @@ PYTHONNOUSERSITE=1 vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --max-num-seqs=$CONC \ --disable-log-requests > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID - -pip install -q datasets pandas +SERVER_PID=$! 
# Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh index 146ab16a5..a3e47ca44 100644 --- a/benchmarks/gptoss_fp4_h200_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_slurm.sh @@ -48,18 +48,14 @@ PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config --gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \ --disable-log-requests > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID +SERVER_PID=$! # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/gptoss_fp4_h200_trt_slurm.sh b/benchmarks/gptoss_fp4_h200_trt_slurm.sh index ffe6e65de..81f1f67de 100644 --- a/benchmarks/gptoss_fp4_h200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_trt_slurm.sh @@ -59,18 +59,14 @@ trtllm-serve $MODEL \ --pp_size=1 \ > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID +SERVER_PID=$! # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh index a3b1dc4f3..003ebf90e 100644 --- a/benchmarks/gptoss_fp4_mi300x_docker.sh +++ b/benchmarks/gptoss_fp4_mi300x_docker.sh @@ -38,18 +38,12 @@ vllm serve $MODEL --port $PORT \ --disable-log-requests \ --async-scheduling > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID - # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/gptoss_fp4_mi300x_slurm.sh b/benchmarks/gptoss_fp4_mi300x_slurm.sh index 053c79197..a9e164cc2 100644 --- a/benchmarks/gptoss_fp4_mi300x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi300x_slurm.sh @@ -48,18 +48,14 @@ vllm serve $MODEL --port $PORT \ --async-scheduling \ > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID +SERVER_PID=$! 
# Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/gptoss_fp4_mi325x_docker.sh b/benchmarks/gptoss_fp4_mi325x_docker.sh index 62d2c5bd0..a000b462f 100644 --- a/benchmarks/gptoss_fp4_mi325x_docker.sh +++ b/benchmarks/gptoss_fp4_mi325x_docker.sh @@ -37,18 +37,12 @@ vllm serve $MODEL --port $PORT \ --disable-log-requests \ --async-scheduling > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID - # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh index c1ac421b6..a9dbff484 100644 --- a/benchmarks/gptoss_fp4_mi325x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh @@ -48,18 +48,12 @@ vllm serve $MODEL --port $PORT \ --async-scheduling \ > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID - # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh index b26bf11b5..e7399694f 100644 --- a/benchmarks/gptoss_fp4_mi355x_docker.sh +++ b/benchmarks/gptoss_fp4_mi355x_docker.sh @@ -36,18 +36,12 @@ vllm serve $MODEL --port $PORT \ --disable-log-requests \ --async-scheduling > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID - # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ diff --git a/benchmarks/gptoss_fp4_mi355x_slurm.sh b/benchmarks/gptoss_fp4_mi355x_slurm.sh index d378685db..2f4d84927 100644 --- a/benchmarks/gptoss_fp4_mi355x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi355x_slurm.sh @@ -38,18 +38,12 @@ vllm serve $MODEL --port $PORT \ --disable-log-requests \ --async-scheduling > $SERVER_LOG 2>&1 & -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! 
-set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID - # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + set -x run_benchmark_serving \ --model "$MODEL" \ From 0d2c112775ed23d87722c923253a63e49d3fdeec Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 14 Nov 2025 16:31:55 -0600 Subject: [PATCH 045/214] dont show arg parsing set -x --- benchmarks/benchmark_lib.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 133f9095f..85d221a92 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -10,6 +10,7 @@ # --server-pid: Server process ID (required) # --sleep-interval: Sleep interval between health checks (optional, default: 5) wait_for_server_ready() { + set -x local port="" local server_log="" local server_pid="" From 271091d3e047fab771eabf5b6fada8a75cffa037 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 14 Nov 2025 16:36:10 -0600 Subject: [PATCH 046/214] dont show arg parsing set +x oops --- benchmarks/benchmark_lib.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 85d221a92..8e52b949d 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -10,7 +10,7 @@ # --server-pid: Server process ID (required) # --sleep-interval: Sleep interval between health checks (optional, default: 5) wait_for_server_ready() { - set -x + set +x local port="" local server_log="" local server_pid="" From 898b132bd48b430c97bf4c8ecdaa322ba80e410f Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 14 Nov 2025 16:44:58 -0600 Subject: [PATCH 047/214] dont show arg parsing set +x oops --- benchmarks/benchmark_lib.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 8e52b949d..cc3448d40 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -59,7 +59,6 @@ wait_for_server_ready() { # Show logs until server is ready tail -f "$server_log" & local TAIL_PID=$! - set +x until curl --output /dev/null --silent --fail http://0.0.0.0:$port/health; do if ! kill -0 "$server_pid" 2>/dev/null; then echo "Server died before becoming healthy. Exiting." 
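# The three commits above chase one goal (keeping the helper's argument
# parsing out of the xtrace output) by hand-toggling set -x/+x, which is
# easy to get backwards, hence the "oops" fixups. For reference, on
# bash 4.4+ the same effect falls out of `local -`, which makes option
# changes function-scoped; a sketch, not what these patches do:
quiet_parse() {
  local -   # bash 4.4+: shell options changed below revert on return
  set +x    # no tracing while this function parses its arguments
  local port="$1"
  echo "parsed port=$port"
}
set -x            # caller enables tracing
quiet_parse 8888  # body runs untraced; tracing resumes after the call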
@@ -85,6 +84,7 @@ wait_for_server_ready() { # --result-filename: Result filename without extension # --result-dir: Result directory run_benchmark_serving() { + set +x local model="" local port="" local backend="" @@ -210,4 +210,5 @@ run_benchmark_serving() { --percentile-metrics 'ttft,tpot,itl,e2el' \ --result-dir "$result_dir" \ --result-filename "$result_filename.json" + set +x } From fd2e33e29c4042f311ba3c95590fe97a3e3ce04c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 14 Nov 2025 17:04:18 -0600 Subject: [PATCH 048/214] capture server pid --- benchmarks/dsr1_fp4_b200_docker.sh | 1 - benchmarks/dsr1_fp4_b200_trt_slurm.sh | 1 - benchmarks/dsr1_fp4_mi355x_docker.sh | 1 - benchmarks/dsr1_fp4_mi355x_slurm.sh | 3 ++- benchmarks/dsr1_fp8_b200_docker.sh | 1 - benchmarks/dsr1_fp8_b200_trt_slurm.sh | 1 - benchmarks/dsr1_fp8_h200_slurm.sh | 1 - benchmarks/dsr1_fp8_h200_trt_slurm.sh | 1 - benchmarks/dsr1_fp8_mi300x_docker.sh | 11 +---------- benchmarks/dsr1_fp8_mi300x_slurm.sh | 1 - benchmarks/dsr1_fp8_mi325x_docker.sh | 1 - benchmarks/dsr1_fp8_mi325x_slurm.sh | 3 ++- benchmarks/dsr1_fp8_mi355x_docker.sh | 3 ++- benchmarks/dsr1_fp8_mi355x_slurm.sh | 3 ++- benchmarks/gptoss_fp4_b200_docker.sh | 1 - benchmarks/gptoss_fp4_h100_slurm.sh | 1 - benchmarks/gptoss_fp4_h200_slurm.sh | 1 - benchmarks/gptoss_fp4_h200_trt_slurm.sh | 1 - benchmarks/gptoss_fp4_mi300x_docker.sh | 3 ++- benchmarks/gptoss_fp4_mi325x_docker.sh | 3 ++- benchmarks/gptoss_fp4_mi325x_slurm.sh | 3 ++- benchmarks/gptoss_fp4_mi355x_docker.sh | 3 ++- benchmarks/gptoss_fp4_mi355x_slurm.sh | 3 ++- 23 files changed, 19 insertions(+), 32 deletions(-) diff --git a/benchmarks/dsr1_fp4_b200_docker.sh b/benchmarks/dsr1_fp4_b200_docker.sh index 5f4ab3c5c..a520871fa 100644 --- a/benchmarks/dsr1_fp4_b200_docker.sh +++ b/benchmarks/dsr1_fp4_b200_docker.sh @@ -36,7 +36,6 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh index a9f7cc9d4..b4227e428 100644 --- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh @@ -108,7 +108,6 @@ source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/dsr1_fp4_mi355x_docker.sh b/benchmarks/dsr1_fp4_mi355x_docker.sh index eed9d1273..f19b6df2e 100644 --- a/benchmarks/dsr1_fp4_mi355x_docker.sh +++ b/benchmarks/dsr1_fp4_mi355x_docker.sh @@ -39,7 +39,6 @@ source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/dsr1_fp4_mi355x_slurm.sh b/benchmarks/dsr1_fp4_mi355x_slurm.sh index afb7ca29c..f4d7f1d39 100644 --- a/benchmarks/dsr1_fp4_mi355x_slurm.sh +++ b/benchmarks/dsr1_fp4_mi355x_slurm.sh @@ -34,13 +34,14 @@ python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \ --cuda-graph-max-bs=128 \ > $SERVER_LOG 2>&1 & +SERVER_PID=$! 
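# A note on the SERVER_PID=$! lines this commit threads through each
# script: $! names the most recently backgrounded job, so it has to be
# read on the line right after the launch. The earlier layout backgrounded
# `tail -f` before anything read $!, leaving wait_for_server_ready no
# server PID to probe. A minimal illustration of the ordering pitfall
# (sleep stands in for the real server launch):
sleep 300 > "$SERVER_LOG" 2>&1 &
SERVER_PID=$!             # the server's PID; read $! before any other '&'
tail -f "$SERVER_LOG" &   # from this point, $! names tail, not the server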
+ # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/dsr1_fp8_b200_docker.sh b/benchmarks/dsr1_fp8_b200_docker.sh index 9a219339c..ffa7644bd 100644 --- a/benchmarks/dsr1_fp8_b200_docker.sh +++ b/benchmarks/dsr1_fp8_b200_docker.sh @@ -47,7 +47,6 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh index a78fece38..a9a1a04ff 100644 --- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh @@ -78,7 +78,6 @@ source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/dsr1_fp8_h200_slurm.sh b/benchmarks/dsr1_fp8_h200_slurm.sh index 6eeec6df1..06345ecb2 100644 --- a/benchmarks/dsr1_fp8_h200_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_slurm.sh @@ -52,7 +52,6 @@ source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index 74b2ce8df..4ece6f7bc 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -78,7 +78,6 @@ source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/dsr1_fp8_mi300x_docker.sh b/benchmarks/dsr1_fp8_mi300x_docker.sh index db27f4e74..8c269dd83 100644 --- a/benchmarks/dsr1_fp8_mi300x_docker.sh +++ b/benchmarks/dsr1_fp8_mi300x_docker.sh @@ -37,15 +37,7 @@ python3 -m sglang.launch_server \ --max-prefill-tokens=196608 \ --disable-radix-cache > $SERVER_LOG 2>&1 & - -# Show logs until server is ready -tail -f $SERVER_LOG & -TAIL_PID=$! -set +x -until curl --output /dev/null --silent --fail http://0.0.0.0:$PORT/health; do - sleep 5 -done -kill $TAIL_PID +SERVER_PID=$! 
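# With the PID in hand, cleanup also becomes possible. These diffs don't
# add one, but an EXIT trap is the usual way to guarantee a backgrounded
# server dies even when a later step fails; a sketch under that
# assumption (my_server_cmd is a placeholder, not a real command):
my_server_cmd > "$SERVER_LOG" 2>&1 &
SERVER_PID=$!
trap 'kill "$SERVER_PID" 2>/dev/null || true' EXIT   # fires on any exit path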
# Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" @@ -53,7 +45,6 @@ source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/dsr1_fp8_mi300x_slurm.sh b/benchmarks/dsr1_fp8_mi300x_slurm.sh index 0d191299c..5fad7a587 100644 --- a/benchmarks/dsr1_fp8_mi300x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi300x_slurm.sh @@ -55,7 +55,6 @@ source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/dsr1_fp8_mi325x_docker.sh b/benchmarks/dsr1_fp8_mi325x_docker.sh index 9f34e7563..565b8fb45 100644 --- a/benchmarks/dsr1_fp8_mi325x_docker.sh +++ b/benchmarks/dsr1_fp8_mi325x_docker.sh @@ -36,7 +36,6 @@ source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/dsr1_fp8_mi325x_slurm.sh b/benchmarks/dsr1_fp8_mi325x_slurm.sh index 15e9cce64..67e4cc394 100644 --- a/benchmarks/dsr1_fp8_mi325x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi325x_slurm.sh @@ -23,13 +23,14 @@ python3 -m sglang.launch_server \ --disable-radix-cache \ > $SERVER_LOG 2>&1 & +SERVER_PID=$! + # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/dsr1_fp8_mi355x_docker.sh b/benchmarks/dsr1_fp8_mi355x_docker.sh index a86d7adbe..d4f1dd013 100644 --- a/benchmarks/dsr1_fp8_mi355x_docker.sh +++ b/benchmarks/dsr1_fp8_mi355x_docker.sh @@ -28,6 +28,8 @@ python3 -m sglang.launch_server \ --max-prefill-tokens 196608 \ --cuda-graph-max-bs 128 > $SERVER_LOG 2>&1 & +SERVER_PID=$! + # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" @@ -44,7 +46,6 @@ else NUM_PROMPTS=$(( CONC * 10 )) fi -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/dsr1_fp8_mi355x_slurm.sh b/benchmarks/dsr1_fp8_mi355x_slurm.sh index 54ba29fc0..fd6fe49fb 100644 --- a/benchmarks/dsr1_fp8_mi355x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi355x_slurm.sh @@ -32,13 +32,14 @@ python3 -m sglang.launch_server \ --max-prefill-tokens 196608 \ --cuda-graph-max-bs 128 > $SERVER_LOG 2>&1 & +SERVER_PID=$! 
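# One gap the helper leaves open: the kill -0 check catches a server that
# crashes, but a server that stays alive without ever answering /health
# still blocks until the CI-level timeout. If that matters, a deadline is
# cheap to add; a sketch assuming a 30-minute readiness budget:
deadline=$(( SECONDS + 1800 ))   # SECONDS is bash's running-time counter
until curl -sf -o /dev/null "http://0.0.0.0:${PORT}/health"; do
  if (( SECONDS >= deadline )); then
    echo "Server not ready within 30 minutes." >&2
    exit 1
  fi
  sleep 5
done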
+ # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh index 60c1a1582..1736701c4 100644 --- a/benchmarks/gptoss_fp4_b200_docker.sh +++ b/benchmarks/gptoss_fp4_b200_docker.sh @@ -61,7 +61,6 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh index c3d598116..843219b95 100644 --- a/benchmarks/gptoss_fp4_h100_slurm.sh +++ b/benchmarks/gptoss_fp4_h100_slurm.sh @@ -45,7 +45,6 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh index a3e47ca44..dc29baf8d 100644 --- a/benchmarks/gptoss_fp4_h200_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_slurm.sh @@ -56,7 +56,6 @@ source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/gptoss_fp4_h200_trt_slurm.sh b/benchmarks/gptoss_fp4_h200_trt_slurm.sh index 81f1f67de..21d6ae02c 100644 --- a/benchmarks/gptoss_fp4_h200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_trt_slurm.sh @@ -67,7 +67,6 @@ source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh index 003ebf90e..7d1f98226 100644 --- a/benchmarks/gptoss_fp4_mi300x_docker.sh +++ b/benchmarks/gptoss_fp4_mi300x_docker.sh @@ -38,13 +38,14 @@ vllm serve $MODEL --port $PORT \ --disable-log-requests \ --async-scheduling > $SERVER_LOG 2>&1 & +SERVER_PID=$! + # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/gptoss_fp4_mi325x_docker.sh b/benchmarks/gptoss_fp4_mi325x_docker.sh index a000b462f..46462ad6d 100644 --- a/benchmarks/gptoss_fp4_mi325x_docker.sh +++ b/benchmarks/gptoss_fp4_mi325x_docker.sh @@ -37,13 +37,14 @@ vllm serve $MODEL --port $PORT \ --disable-log-requests \ --async-scheduling > $SERVER_LOG 2>&1 & +SERVER_PID=$! + # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh index a9dbff484..f15e6261c 100644 --- a/benchmarks/gptoss_fp4_mi325x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh @@ -48,13 +48,14 @@ vllm serve $MODEL --port $PORT \ --async-scheduling \ > $SERVER_LOG 2>&1 & +SERVER_PID=$! 
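# After this commit every serving script reduces to the same skeleton:
# launch, capture PID, wait, benchmark. Spelled out once as a sketch
# (engine flags and the full benchmark argument list vary per platform
# and are elided here):
vllm serve "$MODEL" --port "$PORT" > "$SERVER_LOG" 2>&1 &
SERVER_PID=$!
source "$(dirname "$0")/benchmark_lib.sh"
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
run_benchmark_serving --model "$MODEL" --port "$PORT" \
  --num-prompts $(( CONC * 10 )) --max-concurrency "$CONC" \
  --result-filename "$RESULT_FILENAME" --result-dir /workspace/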
+ # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh index e7399694f..0e54245d4 100644 --- a/benchmarks/gptoss_fp4_mi355x_docker.sh +++ b/benchmarks/gptoss_fp4_mi355x_docker.sh @@ -36,13 +36,14 @@ vllm serve $MODEL --port $PORT \ --disable-log-requests \ --async-scheduling > $SERVER_LOG 2>&1 & +SERVER_PID=$! + # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ diff --git a/benchmarks/gptoss_fp4_mi355x_slurm.sh b/benchmarks/gptoss_fp4_mi355x_slurm.sh index 2f4d84927..a2adf2952 100644 --- a/benchmarks/gptoss_fp4_mi355x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi355x_slurm.sh @@ -38,13 +38,14 @@ vllm serve $MODEL --port $PORT \ --disable-log-requests \ --async-scheduling > $SERVER_LOG 2>&1 & +SERVER_PID=$! + # Source benchmark utilities source "$(dirname "$0")/benchmark_lib.sh" # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -set -x run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ From 2a4faf5c9f26f02ee2f9e98c4fd1335b1c140fde Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 14 Nov 2025 17:50:57 -0600 Subject: [PATCH 049/214] Squash-merge bryan/eval into refactor-docker-runner-launch --- .github/workflows/eval-h100-gms8k.yml | 59 +++++++++ .github/workflows/eval-tmpl.yml | 136 +++++++++++++++++++ benchmarks/dsr1_fp4_mi355x_slurm.sh | 137 +++++++++++++++++++ benchmarks/gptoss_fp4_h100_docker.sh | 8 +- benchmarks/gptoss_fp4_h100_slurm.sh | 12 +- benchmarks/gptoss_fp4_h200_slurm.sh | 2 +- benchmarks/gptoss_fp4_mi300x_slurm.sh | 182 ++++++++++++++++++++++++++ benchmarks/gptoss_fp4_mi325x_slurm.sh | 3 +- benchmarks/gptoss_fp4_mi355x_slurm.sh | 2 + 9 files changed, 532 insertions(+), 9 deletions(-) create mode 100644 .github/workflows/eval-h100-gms8k.yml create mode 100644 .github/workflows/eval-tmpl.yml diff --git a/.github/workflows/eval-h100-gms8k.yml b/.github/workflows/eval-h100-gms8k.yml new file mode 100644 index 000000000..84723f596 --- /dev/null +++ b/.github/workflows/eval-h100-gms8k.yml @@ -0,0 +1,59 @@ +name: Eval - GSM8K on H100 (PoC) + +on: + workflow_dispatch: + inputs: + image: + description: "Serving image" + required: false + type: string + default: "vllm/vllm-openai:v0.11.0" + model: + description: "Model" + required: false + type: string + default: "openai/gpt-oss-120b" + tp: + description: "Tensor Parallel Size" + required: false + type: string + default: "4" + port: + description: "Server port" + required: false + type: string + default: "8888" + num_fewshot: + description: "Fewshot k for GSM8K" + required: false + type: string + default: "5" + limit: + description: "Sample limit for GSM8K" + required: false + type: string + default: "200" + push: + paths: + - '.github/workflows/eval-h100-gms8k.yml' + - '.github/workflows/eval-tmpl.yml' + - 'benchmarks/dsr1_fp8_mi325x_slurm.sh' + +jobs: + eval: + uses: ./.github/workflows/eval-tmpl.yml + secrets: inherit + with: + runner: mi325x-tw_1 + image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' }} + model: 
${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }} + framework: sglang + precision: fp8 + exp-name: dsr1_gsm8k_poc + tp: ${{ inputs.tp || '4' }} + ep: '1' + dp-attn: false + port: ${{ inputs.port || '8888' }} + eval-task: gsm8k + num-fewshot: ${{ inputs.num_fewshot || '5' }} + limit: ${{ inputs.limit || '200' }} diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml new file mode 100644 index 000000000..9c4e77c78 --- /dev/null +++ b/.github/workflows/eval-tmpl.yml @@ -0,0 +1,136 @@ +name: Template - Eval + +on: + workflow_call: + inputs: + runner: + required: true + type: string + image: + required: true + type: string + model: + required: true + type: string + framework: + required: true + type: string + precision: + required: true + type: string + exp-name: + required: true + type: string + tp: + required: true + type: string + ep: + required: false + type: string + default: '1' + dp-attn: + required: false + type: boolean + default: false + port: + required: false + type: string + default: '8888' + eval-task: + required: true + type: string + num-fewshot: + required: false + type: string + default: '5' + limit: + required: false + type: string + default: '200' + +env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + HF_HUB_CACHE: '/mnt/hf_hub_cache/' + EXP_NAME: ${{ inputs.exp-name }} + MODEL: ${{ inputs.model }} + IMAGE: ${{ inputs.image }} + FRAMEWORK: ${{ inputs.framework }} + PRECISION: ${{ inputs.precision }} + TP: ${{ inputs.tp }} + EP_SIZE: ${{ inputs.ep }} + DP_ATTENTION: ${{ inputs.dp-attn }} + PORT: ${{ inputs.port }} + EVAL_TASK: ${{ inputs['eval-task'] }} + NUM_FEWSHOT: ${{ inputs['num-fewshot'] }} + LIMIT: ${{ inputs.limit }} + EVAL_RESULT_DIR: eval_out + # Server-side concurrency default (used by some server scripts) + CONC: '32' + MAX_MODEL_LEN: '8192' + ISL: 1024 + OSL: 8192 + RANDOM_RANGE_RATIO: '1.0' + RESULT_FILENAME: results + +jobs: + eval: + runs-on: ${{ inputs.runner }} + timeout-minutes: 180 + name: "Eval ${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.precision }} tp=${{ inputs.tp }} task=${{ inputs['eval-task'] }} limit=${{ inputs.limit }}" + steps: + - name: Resource cleanup + run: | + if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then + host=$(hostname) + + if [[ "$host" == "b200-81" || "$host" == "b200-80" || "$host" == "b200-79" ]]; then + echo "[INFO] Running container-by-container cleanup on $host" + for cid in $(docker ps -aq); do + echo "[INFO] Cleaning container $cid" + docker stop -t 90 "$cid" || true + docker wait "$cid" >/dev/null 2>&1 || true + docker rm -f "$cid" >/dev/null 2>&1 || true + done + sleep 2 + if nvidia-smi --query-compute-apps=pid --format=csv,noheader | grep -q '[0-9]'; then + echo "[WARN] After stop, GPU still busy:" + nvidia-smi || true + fi + else + echo "[Docker] looking at docker resources ..." + docker ps -aq + fi + fi + if command -v squeue >/dev/null 2>&1; then + echo "[Slurm] Cleaning up resources ..." 
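# The squeue wait just below loops until the queue drains, with no upper
# bound; if a job can ever survive scancel, the runner hangs here. A
# capped variant (a sketch, not what this workflow does) would bail out
# after a fixed number of polls:
#   for _ in $(seq 1 60); do
#     [ -z "$(squeue -u $USER --noheader --format='%i')" ] && break
#     sleep 5
#   done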
+ scancel -u $USER || true + while [ -n "$(squeue -u $USER --noheader --format='%i')" ]; do + squeue -u $USER || true + sleep 5 + done + fi + + - uses: actions/checkout@v5 + with: + fetch-depth: 0 + # Avoid aggressive workspace deletion if stale, rely on git reset/clean later + clean: false + + - name: Launch eval via runner script + env: + RUNNER_NAME: ${{ runner.name }} + RUN_MODE: eval + # Optional: structured filename if runner chooses to use it later + EVAL_RESULT_BASENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_${{ env.FRAMEWORK }}_${{ runner.name }} + run: | + bash ./runners/launch_${RUNNER_NAME%%_*}.sh + + - name: Upload eval artifacts + if: always() + uses: actions/upload-artifact@v5 + with: + name: eval_${{ env.EXP_NAME }}_${{ runner.name }} + path: | + ${{ env.EVAL_RESULT_DIR }}/ + ${{ env.EVAL_RESULT_DIR }}/* + ${{ env.EVAL_RESULT_DIR }}/** diff --git a/benchmarks/dsr1_fp4_mi355x_slurm.sh b/benchmarks/dsr1_fp4_mi355x_slurm.sh index f4d7f1d39..b0f1c33c0 100644 --- a/benchmarks/dsr1_fp4_mi355x_slurm.sh +++ b/benchmarks/dsr1_fp4_mi355x_slurm.sh @@ -54,3 +54,140 @@ run_benchmark_serving \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +####### + +# +## Evals setup +# !TODO clean env vars +EVAL_RESULT_DIR=${EVAL_RESULT_DIR:-eval_out} +OPENAI_SERVER_BASE="http://0.0.0.0:${PORT}" +OPENAI_COMP_BASE="$OPENAI_SERVER_BASE/v1/completions" +OPENAI_CHAT_BASE="$OPENAI_SERVER_BASE/v1/chat/completions" +export OPENAI_API_KEY=${OPENAI_API_KEY:-EMPTY} + +# Patch to convert bypass regex error if content field is empty +PATCH_DIR="$(mktemp -d)" +cat > "$PATCH_DIR/sitecustomize.py" <<'PY' +import re, sys, unicodedata +from lm_eval.filters import extraction as ex + +def _s(x): # coerce to str + return x if isinstance(x, str) else "" + +# --- Patch RegexFilter.apply (used by many datasets) --- +_orig_regex_apply = ex.RegexFilter.apply +def _safe_regex_apply(self, resps, docs): + out = [] + for inst in resps: # inst is a list of candidate responses for one doc + filtered = [] + for resp in inst: + txt = _s(resp) + m = self.regex.findall(txt) + if m: + m = m[self.group_select] + if isinstance(m, tuple): + m = [t for t in m if t] + m = m[0] if m else self.fallback + m = m.strip() + else: + m = self.fallback + filtered.append(m) + out.append(filtered) + return out +ex.RegexFilter.apply = _safe_regex_apply + +# --- Patch MultiChoiceRegexFilter.apply (used by GSM8K flexible-extract) --- +_orig_mc_apply = ex.MultiChoiceRegexFilter.apply +def _safe_mc_apply(self, resps, docs): + def find_match(regex, resp, convert_dict={}): + txt = _s(resp) + match = regex.findall(txt) + if match: + match = match[self.group_select] + if isinstance(match, tuple): + match = [m for m in match if m] + if match: + match = match[0] + if match: + match = match.strip() + if match in convert_dict: + return convert_dict[match] + return match + return None + + punct_tbl = dict.fromkeys( + i for i in range(sys.maxunicode) + if unicodedata.category(chr(i)).startswith("P") + ) + + def filter_ignores(st): + st = _s(st) + if self.regexes_to_ignore is not None: + for s in self.regexes_to_ignore: + st = re.sub(s, "", st) + if self.ignore_case: + st = st.lower() + if self.ignore_punctuation: + st = st.translate(punct_tbl) + return st + + out = [] + for r, doc in zip(resps, docs): + # Build fallback regexes from choices (A, B, C, ...) 
as in upstream + fallback_regexes, choice_to_alpha = [], {} + next_alpha = "A" + without_paren, without_paren_to_target = [], {} + for c in doc.get("choices", []): + m = filter_ignores(c.strip()) + fallback_regexes.append(re.escape(m)) + choice_to_alpha[m] = f"({next_alpha})" + without_paren.append(next_alpha) + without_paren_to_target[next_alpha] = f"({next_alpha})" + next_alpha = chr(ord(next_alpha) + 1) + + fallback_regex = re.compile("|".join(fallback_regexes)) if fallback_regexes else None + without_paren_regex = re.compile(rf":[\s]*({'|'.join(without_paren)})") if without_paren else None + + filtered = [] + for resp in r: + m = find_match(self.regex, resp) + if not m and fallback_regex: + m = find_match(fallback_regex, filter_ignores(resp), choice_to_alpha) + if not m and without_paren_regex: + m = find_match(without_paren_regex, resp, without_paren_to_target) + if not m: + m = self.fallback + filtered.append(m) + out.append(filtered) + return out + +ex.MultiChoiceRegexFilter.apply = _safe_mc_apply +PY + +export PYTHONPATH="${PATCH_DIR}:${PYTHONPATH:-}" +set -x +python3 -m lm_eval --model local-chat-completions --apply_chat_template \ +--tasks ${EVAL_TASK:-gsm8k} \ +--num_fewshot ${NUM_FEWSHOT:-5} \ +--batch_size 2 \ +--output_path "/workspace/${EVAL_RESULT_DIR}" \ +--model_args "model=$MODEL,base_url=$OPENAI_CHAT_BASE,api_key=$OPENAI_API_KEY,eos_string=,max_retries=3,num_concurrent=32,tokenized_requests=False" \ +--gen_kwargs "max_tokens=8192,temperature=0,top_p=1" +set +x + +# Append a Markdown table to the GitHub Actions job summary using helper in bench_serving +if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then +python3 bench_serving/lm_eval_to_md.py \ + --results-dir "/workspace/${EVAL_RESULT_DIR}" \ + --task "${EVAL_TASK:-gsm8k}" \ + --framework "${FRAMEWORK}" \ + --precision "${PRECISION}" \ + --tp "${TP:-1}" \ + --ep "${EP_SIZE:-1}" \ + --dp-attention "${DP_ATTENTION:-false}" \ + >> "$GITHUB_STEP_SUMMARY" || true +fi + +echo "Evaluation completed. 
Results in /workspace/${EVAL_RESULT_DIR}" +exit 0 diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 9cf7c5275..e8f7bdd1d 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -1,9 +1,11 @@ #!/usr/bin/env bash -# === Required Env Vars === +# === Required Env Vars === # HF_TOKEN # HF_HUB_CACHE # MODEL +# ISL +# OSL # MAX_MODEL_LEN # RANDOM_RANGE_RATIO # TP @@ -12,6 +14,7 @@ # OSL +# Create a basic vLLM config cat > config.yaml << EOF compilation-config: '{"cudagraph_mode":"PIECEWISE"}' async-scheduling: true @@ -24,6 +27,7 @@ EOF export PYTHONNOUSERSITE=1 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +# Start server in the background, shld be openai/gpt-oss-120b set -x vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --config config.yaml \ @@ -52,4 +56,4 @@ run_benchmark_serving \ --num-prompts $(( $CONC * 10 )) \ --max-concurrency 512 \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ No newline at end of file + --result-dir /workspace/ diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh index 843219b95..82fad1dd7 100644 --- a/benchmarks/gptoss_fp4_h100_slurm.sh +++ b/benchmarks/gptoss_fp4_h100_slurm.sh @@ -26,14 +26,16 @@ EOF SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) export TORCH_CUDA_ARCH_LIST="9.0" +PORT=${PORT:-8888} set -x PYTHONNOUSERSITE=1 vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ ---config config.yaml \ ---gpu-memory-utilization=0.9 \ ---tensor-parallel-size=$TP \ ---max-num-seqs=$CONC \ ---disable-log-requests > $SERVER_LOG 2>&1 & + --config config.yaml \ + --gpu-memory-utilization=0.9 \ + --tensor-parallel-size=$TP \ + --max-num-seqs=$CONC \ + --disable-log-requests \ + > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh index dc29baf8d..f87361ffd 100644 --- a/benchmarks/gptoss_fp4_h200_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_slurm.sh @@ -16,7 +16,6 @@ echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -set -x hf download $MODEL pip install datasets pandas @@ -40,6 +39,7 @@ max-model-len: $CALCULATED_MAX_MODEL_LEN EOF SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +export TORCH_CUDA_ARCH_LIST="9.0" PORT=$(( 8888 + $PORT_OFFSET )) export TORCH_CUDA_ARCH_LIST="9.0" diff --git a/benchmarks/gptoss_fp4_mi300x_slurm.sh b/benchmarks/gptoss_fp4_mi300x_slurm.sh index a9e164cc2..eadfa16b3 100644 --- a/benchmarks/gptoss_fp4_mi300x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi300x_slurm.sh @@ -35,6 +35,8 @@ export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 +# +## Start up vllm server set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ @@ -67,3 +69,183 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ +set +x + +# +## Ensure benching scripts present +git config --global --add safe.directory /workspace || true +if [[ ! 
-d bench_serving ]]; then + git clone https://github.com/kimbochen/bench_serving.git +fi + +# +## Deps for lm-eval +#python3 -m pip install -q --upgrade pip || true +python3 -m pip install -q --no-cache-dir "lm-eval[api]" || true +# Temporary: workaround known harness issue +python3 -m pip install -q --no-cache-dir --no-deps "git+https://github.com/EleutherAI/lm-evaluation-harness.git@main" || true + +# +## Wait for vllm server to start up +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +# +## Run benchmark +set -x +python3 bench_serving/benchmark_serving.py \ +--model $MODEL \ +--backend vllm \ +--base-url http://0.0.0.0:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json +set +x + +####### + +# +## Evals setup +# !TODO clean env vars +EVAL_RESULT_DIR=${EVAL_RESULT_DIR:-eval_out} +OPENAI_SERVER_BASE="http://0.0.0.0:${PORT}" +OPENAI_COMP_BASE="$OPENAI_SERVER_BASE/v1/completions" +OPENAI_CHAT_BASE="$OPENAI_SERVER_BASE/v1/chat/completions" +export OPENAI_API_KEY=${OPENAI_API_KEY:-EMPTY} + +# Patch to convert bypass regex error if content field is empty +PATCH_DIR="$(mktemp -d)" +cat > "$PATCH_DIR/sitecustomize.py" <<'PY' +import re, sys, unicodedata +from lm_eval.filters import extraction as ex + +def _s(x): # coerce to str + return x if isinstance(x, str) else "" + +# --- Patch RegexFilter.apply (used by many datasets) --- +_orig_regex_apply = ex.RegexFilter.apply +def _safe_regex_apply(self, resps, docs): + out = [] + for inst in resps: # inst is a list of candidate responses for one doc + filtered = [] + for resp in inst: + txt = _s(resp) + m = self.regex.findall(txt) + if m: + m = m[self.group_select] + if isinstance(m, tuple): + m = [t for t in m if t] + m = m[0] if m else self.fallback + m = m.strip() + else: + m = self.fallback + filtered.append(m) + out.append(filtered) + return out +ex.RegexFilter.apply = _safe_regex_apply + +# --- Patch MultiChoiceRegexFilter.apply (used by GSM8K flexible-extract) --- +_orig_mc_apply = ex.MultiChoiceRegexFilter.apply +def _safe_mc_apply(self, resps, docs): + def find_match(regex, resp, convert_dict={}): + txt = _s(resp) + match = regex.findall(txt) + if match: + match = match[self.group_select] + if isinstance(match, tuple): + match = [m for m in match if m] + if match: + match = match[0] + if match: + match = match.strip() + if match in convert_dict: + return convert_dict[match] + return match + return None + + punct_tbl = dict.fromkeys( + i for i in range(sys.maxunicode) + if unicodedata.category(chr(i)).startswith("P") + ) + + def filter_ignores(st): + st = _s(st) + if self.regexes_to_ignore is not None: + for s in self.regexes_to_ignore: + st = re.sub(s, "", st) + if self.ignore_case: + st = st.lower() + if self.ignore_punctuation: + st = st.translate(punct_tbl) + return st + + out = [] + for r, doc in zip(resps, docs): + # Build fallback regexes from choices (A, B, C, ...) 
as in upstream + fallback_regexes, choice_to_alpha = [], {} + next_alpha = "A" + without_paren, without_paren_to_target = [], {} + for c in doc.get("choices", []): + m = filter_ignores(c.strip()) + fallback_regexes.append(re.escape(m)) + choice_to_alpha[m] = f"({next_alpha})" + without_paren.append(next_alpha) + without_paren_to_target[next_alpha] = f"({next_alpha})" + next_alpha = chr(ord(next_alpha) + 1) + + fallback_regex = re.compile("|".join(fallback_regexes)) if fallback_regexes else None + without_paren_regex = re.compile(rf":[\s]*({'|'.join(without_paren)})") if without_paren else None + + filtered = [] + for resp in r: + m = find_match(self.regex, resp) + if not m and fallback_regex: + m = find_match(fallback_regex, filter_ignores(resp), choice_to_alpha) + if not m and without_paren_regex: + m = find_match(without_paren_regex, resp, without_paren_to_target) + if not m: + m = self.fallback + filtered.append(m) + out.append(filtered) + return out + +ex.MultiChoiceRegexFilter.apply = _safe_mc_apply +PY + +export PYTHONPATH="${PATCH_DIR}:${PYTHONPATH:-}" +set -x +python3 -m lm_eval --model local-chat-completions --apply_chat_template \ +--tasks ${EVAL_TASK:-gsm8k} \ +--num_fewshot ${NUM_FEWSHOT:-5} \ +--batch_size 2 \ +--output_path "/workspace/${EVAL_RESULT_DIR}" \ +--model_args "model=$MODEL,base_url=$OPENAI_CHAT_BASE,api_key=$OPENAI_API_KEY,eos_string=,max_retries=3,num_concurrent=32,tokenized_requests=False" \ +--gen_kwargs "max_tokens=8192,temperature=0,top_p=1" +set +x + +# Append a Markdown table to the GitHub Actions job summary using helper in bench_serving +if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then +python3 bench_serving/lm_eval_to_md.py \ + --results-dir "/workspace/${EVAL_RESULT_DIR}" \ + --task "${EVAL_TASK:-gsm8k}" \ + --framework "${FRAMEWORK}" \ + --precision "${PRECISION}" \ + --tp "${TP:-1}" \ + --ep "${EP_SIZE:-1}" \ + --dp-attention "${DP_ATTENTION:-false}" \ + >> "$GITHUB_STEP_SUMMARY" || true +fi + +echo "Evaluation completed. 
Results in /workspace/${EVAL_RESULT_DIR}" +exit 0 + + + diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh index f15e6261c..d593eb361 100644 --- a/benchmarks/gptoss_fp4_mi325x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh @@ -13,7 +13,6 @@ # CONC # RESULT_FILENAME - echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" hf download $MODEL @@ -35,6 +34,8 @@ export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 +# +## Start up vllm server set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ diff --git a/benchmarks/gptoss_fp4_mi355x_slurm.sh b/benchmarks/gptoss_fp4_mi355x_slurm.sh index a2adf2952..d21720add 100644 --- a/benchmarks/gptoss_fp4_mi355x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi355x_slurm.sh @@ -26,6 +26,8 @@ export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_FUSED_MOE_A16W4=1 +# +## Start up vllm server set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ From 173d7bf46f1b83ba94eba2794e34fbb1fe3dea63 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 08:01:06 -0600 Subject: [PATCH 050/214] evals h100-cr --- .../{eval-h100-gms8k.yml => eval-gms8k.yml} | 16 +- benchmarks/benchmark_lib.sh | 210 ++++++++++++++++++ benchmarks/gptoss_fp4_h100_slurm.sh | 4 + benchmarks/gptoss_fp4_mi300x_slurm.sh | 24 -- 4 files changed, 222 insertions(+), 32 deletions(-) rename .github/workflows/{eval-h100-gms8k.yml => eval-gms8k.yml} (76%) diff --git a/.github/workflows/eval-h100-gms8k.yml b/.github/workflows/eval-gms8k.yml similarity index 76% rename from .github/workflows/eval-h100-gms8k.yml rename to .github/workflows/eval-gms8k.yml index 84723f596..1822ba209 100644 --- a/.github/workflows/eval-h100-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -35,21 +35,21 @@ on: default: "200" push: paths: - - '.github/workflows/eval-h100-gms8k.yml' + - '.github/workflows/eval-gms8k.yml' - '.github/workflows/eval-tmpl.yml' - - 'benchmarks/dsr1_fp8_mi325x_slurm.sh' + - 'benchmarks/gptoss_fp4_h100_slurm.sh' jobs: eval: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi325x-tw_1 - image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' }} - model: ${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }} - framework: sglang - precision: fp8 - exp-name: dsr1_gsm8k_poc + runner: h100-cr_1 + image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} + model: ${{ inputs.model || 'openai/gpt-oss-120b' }} + framework: vllm + precision: fp4 + exp-name: gptoss_gsm8k_poc tp: ${{ inputs.tp || '4' }} ep: '1' dp-attn: false diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index cc3448d40..7c361c649 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -212,3 +212,213 @@ run_benchmark_serving() { --result-filename "$result_filename.json" set +x } + + +# ------------------------------ +# Eval (lm-eval-harness) helpers +# ------------------------------ + +# Ensure bench_serving repo is available for helper utilities (e.g., md summary) +_ensure_bench_serving_repo() { + set +x + git config --global --add safe.directory /workspace || true + if [[ ! 
-d bench_serving ]]; then + git clone https://github.com/kimbochen/bench_serving.git || true + fi +} + +# Install or update lm-eval dependencies +_install_lm_eval_deps() { + set +x + python3 -m pip install -q --no-cache-dir "lm-eval[api]" || true + # Temporary: workaround known harness issue by using main + python3 -m pip install -q --no-cache-dir --no-deps \ + "git+https://github.com/EleutherAI/lm-evaluation-harness.git@main" || true +} + +# Patch lm-eval filters to be robust to empty strings via sitecustomize +_patch_lm_eval_filters() { + set +x + local patch_dir + patch_dir="$(mktemp -d)" + cat > "$patch_dir/sitecustomize.py" <<'PY' +import re, sys, unicodedata +from lm_eval.filters import extraction as ex + +def _s(x): # coerce to str + return x if isinstance(x, str) else "" + +# --- Patch RegexFilter.apply (used by many datasets) --- +_orig_regex_apply = ex.RegexFilter.apply +def _safe_regex_apply(self, resps, docs): + out = [] + for inst in resps: # inst is a list of candidate responses for one doc + filtered = [] + for resp in inst: + txt = _s(resp) + m = self.regex.findall(txt) + if m: + m = m[self.group_select] + if isinstance(m, tuple): + m = [t for t in m if t] + m = m[0] if m else self.fallback + m = m.strip() + else: + m = self.fallback + filtered.append(m) + out.append(filtered) + return out +ex.RegexFilter.apply = _safe_regex_apply + +# --- Patch MultiChoiceRegexFilter.apply (used by GSM8K flexible-extract) --- +_orig_mc_apply = ex.MultiChoiceRegexFilter.apply +def _safe_mc_apply(self, resps, docs): + def find_match(regex, resp, convert_dict={}): + txt = _s(resp) + match = regex.findall(txt) + if match: + match = match[self.group_select] + if isinstance(match, tuple): + match = [m for m in match if m] + if match: + match = match[0] + if match: + match = match.strip() + if match in convert_dict: + return convert_dict[match] + return match + return None + + punct_tbl = dict.fromkeys( + i for i in range(sys.maxunicode) + if unicodedata.category(chr(i)).startswith("P") + ) + + def filter_ignores(st): + st = _s(st) + if self.regexes_to_ignore is not None: + for s in self.regexes_to_ignore: + st = re.sub(s, "", st) + if self.ignore_case: + st = st.lower() + if self.ignore_punctuation: + st = st.translate(punct_tbl) + return st + + out = [] + for r, doc in zip(resps, docs): + # Build fallback regexes from choices (A, B, C, ...) 
as in upstream + fallback_regexes, choice_to_alpha = [], {} + next_alpha = "A" + without_paren, without_paren_to_target = [], {} + for c in doc.get("choices", []): + m = filter_ignores(c.strip()) + fallback_regexes.append(re.escape(m)) + choice_to_alpha[m] = f"({next_alpha})" + without_paren.append(next_alpha) + without_paren_to_target[next_alpha] = f"({next_alpha})" + next_alpha = chr(ord(next_alpha) + 1) + + fallback_regex = re.compile("|".join(fallback_regexes)) if fallback_regexes else None + without_paren_regex = re.compile(rf":[\s]*({'|'.join(without_paren)})") if without_paren else None + + filtered = [] + for resp in r: + m = find_match(self.regex, resp) + if not m and fallback_regex: + m = find_match(fallback_regex, filter_ignores(resp), choice_to_alpha) + if not m and without_paren_regex: + m = find_match(without_paren_regex, resp, without_paren_to_target) + if not m: + m = self.fallback + filtered.append(m) + out.append(filtered) + return out + +ex.MultiChoiceRegexFilter.apply = _safe_mc_apply +PY + export PYTHONPATH="${patch_dir}:${PYTHONPATH:-}" +} + +# Run an lm-eval-harness task against a local OpenAI-compatible server +# Parameters: +# --port: Server port (default: $PORT or 8888) +# --task: Eval task (default: $EVAL_TASK or gsm8k) +# --num-fewshot: Fewshot k (default: $NUM_FEWSHOT or 5) +# --results-dir: Output dir (default: $EVAL_RESULT_DIR or eval_out) +# --batch-size: Harness batch size (default: 2) +# --gen-max-tokens: Max tokens for generation (default: 8192) +# --temperature: Temperature (default: 0) +# --top-p: Top-p (default: 1) +run_lm_eval() { + set +x + local port="${PORT:-8888}" + local task="${EVAL_TASK:-gsm8k}" + local num_fewshot="${NUM_FEWSHOT:-5}" + local results_dir="${EVAL_RESULT_DIR:-eval_out}" + local batch_size=2 + local gen_max_tokens=8192 + local temperature=0 + local top_p=1 + + # Parse arguments + while [[ $# -gt 0 ]]; do + case $1 in + --port) + port="$2"; shift 2;; + --task) + task="$2"; shift 2;; + --num-fewshot) + num_fewshot="$2"; shift 2;; + --results-dir) + results_dir="$2"; shift 2;; + --batch-size) + batch_size="$2"; shift 2;; + --gen-max-tokens) + gen_max_tokens="$2"; shift 2;; + --temperature) + temperature="$2"; shift 2;; + --top-p) + top_p="$2"; shift 2;; + *) + echo "Unknown parameter: $1"; return 1;; + esac + done + + _ensure_bench_serving_repo + _install_lm_eval_deps + _patch_lm_eval_filters + + local openai_server_base="http://0.0.0.0:${port}" + local openai_chat_base="$openai_server_base/v1/chat/completions" + export OPENAI_API_KEY=${OPENAI_API_KEY:-EMPTY} + + set -x + python3 -m lm_eval --model local-chat-completions --apply_chat_template \ + --tasks "${task}" \ + --num_fewshot "${num_fewshot}" \ + --batch_size "${batch_size}" \ + --output_path "/workspace/${results_dir}" \ + --model_args "model=${MODEL},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=3,num_concurrent=32,tokenized_requests=False" \ + --gen_kwargs "max_tokens=${gen_max_tokens},temperature=${temperature},top_p=${top_p}" + set +x +} + +# Append a Markdown summary to GitHub step summary (no-op if not in GH Actions) +append_lm_eval_summary() { + set +x + local results_dir="${EVAL_RESULT_DIR:-eval_out}" + local task="${EVAL_TASK:-gsm8k}" + if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then + _ensure_bench_serving_repo + python3 bench_serving/lm_eval_to_md.py \ + --results-dir "/workspace/${results_dir}" \ + --task "${task}" \ + --framework "${FRAMEWORK}" \ + --precision "${PRECISION}" \ + --tp "${TP:-1}" \ + --ep "${EP_SIZE:-1}" \ + 
--dp-attention "${DP_ATTENTION:-false}" \ + >> "$GITHUB_STEP_SUMMARY" || true + fi +} diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh index 82fad1dd7..b463a8aaf 100644 --- a/benchmarks/gptoss_fp4_h100_slurm.sh +++ b/benchmarks/gptoss_fp4_h100_slurm.sh @@ -58,3 +58,7 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation (defaults to GSM8K) +run_lm_eval --port "$PORT" +append_lm_eval_summary diff --git a/benchmarks/gptoss_fp4_mi300x_slurm.sh b/benchmarks/gptoss_fp4_mi300x_slurm.sh index eadfa16b3..eaacf6b1b 100644 --- a/benchmarks/gptoss_fp4_mi300x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi300x_slurm.sh @@ -85,30 +85,6 @@ python3 -m pip install -q --no-cache-dir "lm-eval[api]" || true # Temporary: workaround known harness issue python3 -m pip install -q --no-cache-dir --no-deps "git+https://github.com/EleutherAI/lm-evaluation-harness.git@main" || true -# -## Wait for vllm server to start up -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -# -## Run benchmark -set -x -python3 bench_serving/benchmark_serving.py \ ---model $MODEL \ ---backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json -set +x ####### From 4ff8a9b2b8ad0b7651e4c68ab67ee8323319c687 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 08:21:41 -0600 Subject: [PATCH 051/214] evals h100-cw --- .github/workflows/eval-gms8k.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 1822ba209..92919e356 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -44,7 +44,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: h100-cr_1 + runner: h100-cw_1 image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm From 83901e7427bbd4c6cc46de9e26eb80dbdcff9fd0 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 14:02:45 -0600 Subject: [PATCH 052/214] evals h200-nb --- .github/workflows/eval-gms8k.yml | 4 ++-- .github/workflows/eval-tmpl.yml | 2 +- benchmarks/gptoss_fp4_h200_slurm.sh | 4 ++++ 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 92919e356..752c92ce2 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -37,14 +37,14 @@ on: paths: - '.github/workflows/eval-gms8k.yml' - '.github/workflows/eval-tmpl.yml' - - 'benchmarks/gptoss_fp4_h100_slurm.sh' + - 'benchmarks/gptoss_fp4_h200_slurm.sh' jobs: eval: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: h100-cw_1 + runner: h200-nb_1 image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 9c4e77c78..7b42a7853 100644 --- a/.github/workflows/eval-tmpl.yml +++ 
b/.github/workflows/eval-tmpl.yml @@ -65,7 +65,7 @@ env: LIMIT: ${{ inputs.limit }} EVAL_RESULT_DIR: eval_out # Server-side concurrency default (used by some server scripts) - CONC: '32' + CONC: '16' MAX_MODEL_LEN: '8192' ISL: 1024 OSL: 8192 diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh index f87361ffd..2c18d4d6a 100644 --- a/benchmarks/gptoss_fp4_h200_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_slurm.sh @@ -67,3 +67,7 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation (defaults to GSM8K) +run_lm_eval --port "$PORT" +append_lm_eval_summary \ No newline at end of file From 6c65a24e646efd17dd0a16fc11c2616721a7913e Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 14:26:20 -0600 Subject: [PATCH 053/214] move eval script here --- .github/workflows/eval-gms8k.yml | 1 + benchmarks/benchmark_lib.sh | 15 +--- utils/lm_eval_to_md.py | 137 +++++++++++++++++++++++++++++++ 3 files changed, 140 insertions(+), 13 deletions(-) create mode 100644 utils/lm_eval_to_md.py diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 752c92ce2..23e2be4bb 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -38,6 +38,7 @@ on: - '.github/workflows/eval-gms8k.yml' - '.github/workflows/eval-tmpl.yml' - 'benchmarks/gptoss_fp4_h200_slurm.sh' + - 'benchmarks/benchmark_lib.sh' jobs: eval: diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 7c361c649..3a9df22bc 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -218,15 +218,6 @@ run_benchmark_serving() { # Eval (lm-eval-harness) helpers # ------------------------------ -# Ensure bench_serving repo is available for helper utilities (e.g., md summary) -_ensure_bench_serving_repo() { - set +x - git config --global --add safe.directory /workspace || true - if [[ ! -d bench_serving ]]; then - git clone https://github.com/kimbochen/bench_serving.git || true - fi -} - # Install or update lm-eval dependencies _install_lm_eval_deps() { set +x @@ -384,8 +375,7 @@ run_lm_eval() { echo "Unknown parameter: $1"; return 1;; esac done - - _ensure_bench_serving_repo + _install_lm_eval_deps _patch_lm_eval_filters @@ -410,8 +400,7 @@ append_lm_eval_summary() { local results_dir="${EVAL_RESULT_DIR:-eval_out}" local task="${EVAL_TASK:-gsm8k}" if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then - _ensure_bench_serving_repo - python3 bench_serving/lm_eval_to_md.py \ + python3 utils/lm_eval_to_md.py \ --results-dir "/workspace/${results_dir}" \ --task "${task}" \ --framework "${FRAMEWORK}" \ diff --git a/utils/lm_eval_to_md.py b/utils/lm_eval_to_md.py new file mode 100644 index 000000000..dbcc4d88d --- /dev/null +++ b/utils/lm_eval_to_md.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python3 +""" +Convert latest lm-evaluation-harness JSON in a results dir into a Markdown table +for GitHub Actions job summary. Prints to stdout. 
+ +Usage: + python3 bench_serving/scripts/lm_eval_to_md.py \ + --results-dir /workspace/eval_out \ + --task gsm8k \ + --framework vLLM \ + --precision fp16 \ + --tp 4 \ + --ep 1 \ + --dp-attention false +""" +import argparse, json, os, re, sys +from collections import Counter +from glob import glob + +def find_latest_json(results_dir: str): + paths = [] + for root, _, _ in os.walk(results_dir): + paths.extend(glob(os.path.join(root, "*.json"))) + if not paths: + return None + paths.sort(key=lambda p: os.path.getmtime(p), reverse=True) + return paths[0] + +def pct(x): + return f"{x*100:.2f}%" if isinstance(x, (int, float)) else "N/A" + +def se(x): + return f" \u00B1{(x*100):.2f}%" if isinstance(x, (int, float)) else "" + +def gpu_cpu_from_pretty_env(pe: str): + if not isinstance(pe, str) or not pe: + return "Unknown GPU" + gpu_lines = [l for l in pe.splitlines() if l.startswith("GPU ")] + names = [re.sub(r"GPU \d+:\s*", "", l).strip() for l in gpu_lines] + c = Counter(names) + gpu_summary = " + ".join([f"{n}\u00D7 {name}" for name, n in c.items()]) if c else "Unknown GPU" + cpu_line = next((l.split(":", 1)[1].strip() for l in pe.splitlines() if l.startswith("Model name:")), None) + return gpu_summary + (f" ({cpu_line})" if cpu_line else "") + +def extract_metrics(data: dict, task: str): + # results section can vary across harness versions + res_all = data.get("results", {}) or {} + res = res_all.get(task) if isinstance(res_all, dict) else {} + if not res and isinstance(res_all, dict) and res_all: + # fallback to first key if requested task missing + any_key = next(iter(res_all.keys())) + res = res_all.get(any_key, {}) + task = any_key + + strict = res.get("exact_match,strict-match") + flex = res.get("exact_match,flexible-extract") + strict_se = res.get("exact_match_stderr,strict-match") + flex_se = res.get("exact_match_stderr,flexible-extract") + + n_eff = None + ns = data.get("n-samples") or data.get("n_samples") or {} + if isinstance(ns, dict): + tdict = ns.get(task) or ns.get("gsm8k") or {} + if isinstance(tdict, dict): + n_eff = tdict.get("effective") or tdict.get("n_eff") + + # model/fewshot/limit are scattered depending on version + model = data.get("model_name") \ + or data.get("configs", {}).get(task, {}).get("metadata", {}).get("model") \ + or data.get("config", {}).get("model") \ + or "" + + # k-shot + fewshot = None + nshot = data.get("n-shot") or data.get("n_shot") or {} + if isinstance(nshot, dict): + fewshot = nshot.get(task) or nshot.get("gsm8k") + + # limit + limit = None + cfg = data.get("config") or {} + if isinstance(cfg, dict): + limit = cfg.get("limit") + + return { + "task": task, + "strict": strict, + "flex": flex, + "strict_se": strict_se, + "flex_se": flex_se, + "n_eff": n_eff, + "model": model, + "fewshot": fewshot, + "limit": limit + } + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--results-dir", required=True) + ap.add_argument("--task", default="gsm8k") + ap.add_argument("--framework", default=os.environ.get("FRAMEWORK", "vLLM")) + ap.add_argument("--precision", default=os.environ.get("PRECISION", "fp16")) + ap.add_argument("--tp", default=os.environ.get("TP", "1")) + ap.add_argument("--ep", default=os.environ.get("EP_SIZE", "1")) + ap.add_argument("--dp-attention", default=os.environ.get("DP_ATTENTION", "false")) + args = ap.parse_args() + + path = find_latest_json(args.results_dir) + print(f"### {args.task} Evaluation\n") + if not path or not os.path.exists(path): + print(f"> No result JSON found in `{args.results_dir}`.") + return + + 
with open(path, "r") as f: + data = json.load(f) + + hardware = gpu_cpu_from_pretty_env(data.get("pretty_env_info", "")) + m = extract_metrics(data, args.task) + + print("| Hardware | Framework | Precision | TP | EP | DP Attention | EM Strict | EM Flexible | N (eff) |") + print("|---|---|---:|--:|--:|:--:|--:|--:|--:|") + print(f"| {hardware} | {args.framework} | {args.precision} | {args.tp} | {args.ep} | {str(args.dp_attention).lower()} | " + f"{pct(m['strict'])}{se(m['strict_se'])} | {pct(m['flex'])}{se(m['flex_se'])} | {m['n_eff'] or ''} |") + + # metadata line + lim = m["limit"] + lim_str = str(int(lim)) if isinstance(lim, (int, float)) else (str(lim) if lim is not None else "") + fewshot = m["fewshot"] if m["fewshot"] is not None else "" + print(f"\n_Model_: `{m['model']}`    _k-shot_: **{fewshot}**    _limit_: **{lim_str}** \n_Source_: `{os.path.basename(path)}`") + +if __name__ == "__main__": + try: + main() + except Exception as e: + # Never blow up the CI summary; emit a helpful line instead. + print(f"> Failed to render evaluation summary: {e}") + sys.exit(0) \ No newline at end of file From 343d24e045be0888a7e11ab624223d6c3d8c9771 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 15:33:30 -0600 Subject: [PATCH 054/214] evals mi300x-amd --- .github/workflows/eval-gms8k.yml | 6 +- benchmarks/gptoss_fp4_mi300x_docker.sh | 7 +- benchmarks/gptoss_fp4_mi300x_slurm.sh | 159 +------------------------ benchmarks/gptoss_fp4_mi325x_docker.sh | 5 + benchmarks/gptoss_fp4_mi325x_slurm.sh | 5 + benchmarks/gptoss_fp4_mi355x_docker.sh | 5 + benchmarks/gptoss_fp4_mi355x_slurm.sh | 5 + 7 files changed, 32 insertions(+), 160 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 23e2be4bb..a814e0da3 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -37,16 +37,14 @@ on: paths: - '.github/workflows/eval-gms8k.yml' - '.github/workflows/eval-tmpl.yml' - - 'benchmarks/gptoss_fp4_h200_slurm.sh' - - 'benchmarks/benchmark_lib.sh' jobs: eval: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: h200-nb_1 - image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} + runner: mi300x-amd_0 + image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm precision: fp4 diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh index 7d1f98226..63dcf76e1 100644 --- a/benchmarks/gptoss_fp4_mi300x_docker.sh +++ b/benchmarks/gptoss_fp4_mi300x_docker.sh @@ -56,4 +56,9 @@ run_benchmark_serving \ --num-prompts $(( $CONC * 10 )) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ No newline at end of file + --result-dir /workspace/ + +# After throughput, run evaluation (defaults to GSM8K) +run_lm_eval --port "$PORT" +append_lm_eval_summary +set +x \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_mi300x_slurm.sh b/benchmarks/gptoss_fp4_mi300x_slurm.sh index eaacf6b1b..b0ba7db04 100644 --- a/benchmarks/gptoss_fp4_mi300x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi300x_slurm.sh @@ -69,159 +69,8 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ -set +x - -# -## Ensure benching scripts present -git config --global --add safe.directory /workspace || true -if [[ ! 
-d bench_serving ]]; then - git clone https://github.com/kimbochen/bench_serving.git -fi - -# -## Deps for lm-eval -#python3 -m pip install -q --upgrade pip || true -python3 -m pip install -q --no-cache-dir "lm-eval[api]" || true -# Temporary: workaround known harness issue -python3 -m pip install -q --no-cache-dir --no-deps "git+https://github.com/EleutherAI/lm-evaluation-harness.git@main" || true - - -####### - -# -## Evals setup -# !TODO clean env vars -EVAL_RESULT_DIR=${EVAL_RESULT_DIR:-eval_out} -OPENAI_SERVER_BASE="http://0.0.0.0:${PORT}" -OPENAI_COMP_BASE="$OPENAI_SERVER_BASE/v1/completions" -OPENAI_CHAT_BASE="$OPENAI_SERVER_BASE/v1/chat/completions" -export OPENAI_API_KEY=${OPENAI_API_KEY:-EMPTY} - -# Patch to convert bypass regex error if content field is empty -PATCH_DIR="$(mktemp -d)" -cat > "$PATCH_DIR/sitecustomize.py" <<'PY' -import re, sys, unicodedata -from lm_eval.filters import extraction as ex - -def _s(x): # coerce to str - return x if isinstance(x, str) else "" - -# --- Patch RegexFilter.apply (used by many datasets) --- -_orig_regex_apply = ex.RegexFilter.apply -def _safe_regex_apply(self, resps, docs): - out = [] - for inst in resps: # inst is a list of candidate responses for one doc - filtered = [] - for resp in inst: - txt = _s(resp) - m = self.regex.findall(txt) - if m: - m = m[self.group_select] - if isinstance(m, tuple): - m = [t for t in m if t] - m = m[0] if m else self.fallback - m = m.strip() - else: - m = self.fallback - filtered.append(m) - out.append(filtered) - return out -ex.RegexFilter.apply = _safe_regex_apply - -# --- Patch MultiChoiceRegexFilter.apply (used by GSM8K flexible-extract) --- -_orig_mc_apply = ex.MultiChoiceRegexFilter.apply -def _safe_mc_apply(self, resps, docs): - def find_match(regex, resp, convert_dict={}): - txt = _s(resp) - match = regex.findall(txt) - if match: - match = match[self.group_select] - if isinstance(match, tuple): - match = [m for m in match if m] - if match: - match = match[0] - if match: - match = match.strip() - if match in convert_dict: - return convert_dict[match] - return match - return None - - punct_tbl = dict.fromkeys( - i for i in range(sys.maxunicode) - if unicodedata.category(chr(i)).startswith("P") - ) - - def filter_ignores(st): - st = _s(st) - if self.regexes_to_ignore is not None: - for s in self.regexes_to_ignore: - st = re.sub(s, "", st) - if self.ignore_case: - st = st.lower() - if self.ignore_punctuation: - st = st.translate(punct_tbl) - return st - - out = [] - for r, doc in zip(resps, docs): - # Build fallback regexes from choices (A, B, C, ...) 
as in upstream - fallback_regexes, choice_to_alpha = [], {} - next_alpha = "A" - without_paren, without_paren_to_target = [], {} - for c in doc.get("choices", []): - m = filter_ignores(c.strip()) - fallback_regexes.append(re.escape(m)) - choice_to_alpha[m] = f"({next_alpha})" - without_paren.append(next_alpha) - without_paren_to_target[next_alpha] = f"({next_alpha})" - next_alpha = chr(ord(next_alpha) + 1) - - fallback_regex = re.compile("|".join(fallback_regexes)) if fallback_regexes else None - without_paren_regex = re.compile(rf":[\s]*({'|'.join(without_paren)})") if without_paren else None - - filtered = [] - for resp in r: - m = find_match(self.regex, resp) - if not m and fallback_regex: - m = find_match(fallback_regex, filter_ignores(resp), choice_to_alpha) - if not m and without_paren_regex: - m = find_match(without_paren_regex, resp, without_paren_to_target) - if not m: - m = self.fallback - filtered.append(m) - out.append(filtered) - return out - -ex.MultiChoiceRegexFilter.apply = _safe_mc_apply -PY - -export PYTHONPATH="${PATCH_DIR}:${PYTHONPATH:-}" -set -x -python3 -m lm_eval --model local-chat-completions --apply_chat_template \ ---tasks ${EVAL_TASK:-gsm8k} \ ---num_fewshot ${NUM_FEWSHOT:-5} \ ---batch_size 2 \ ---output_path "/workspace/${EVAL_RESULT_DIR}" \ ---model_args "model=$MODEL,base_url=$OPENAI_CHAT_BASE,api_key=$OPENAI_API_KEY,eos_string=,max_retries=3,num_concurrent=32,tokenized_requests=False" \ ---gen_kwargs "max_tokens=8192,temperature=0,top_p=1" -set +x - -# Append a Markdown table to the GitHub Actions job summary using helper in bench_serving -if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then -python3 bench_serving/lm_eval_to_md.py \ - --results-dir "/workspace/${EVAL_RESULT_DIR}" \ - --task "${EVAL_TASK:-gsm8k}" \ - --framework "${FRAMEWORK}" \ - --precision "${PRECISION}" \ - --tp "${TP:-1}" \ - --ep "${EP_SIZE:-1}" \ - --dp-attention "${DP_ATTENTION:-false}" \ - >> "$GITHUB_STEP_SUMMARY" || true -fi - -echo "Evaluation completed. 
Results in /workspace/${EVAL_RESULT_DIR}" -exit 0 - - +# After throughput, run evaluation (defaults to GSM8K) +run_lm_eval --port "$PORT" +append_lm_eval_summary +set +x \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_mi325x_docker.sh b/benchmarks/gptoss_fp4_mi325x_docker.sh index 46462ad6d..ccfe6e1c3 100644 --- a/benchmarks/gptoss_fp4_mi325x_docker.sh +++ b/benchmarks/gptoss_fp4_mi325x_docker.sh @@ -56,3 +56,8 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation (defaults to GSM8K) +run_lm_eval --port "$PORT" +append_lm_eval_summary +set +x \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh index d593eb361..4219d0662 100644 --- a/benchmarks/gptoss_fp4_mi325x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh @@ -68,3 +68,8 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation (defaults to GSM8K) +run_lm_eval --port "$PORT" +append_lm_eval_summary +set +x \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh index 0e54245d4..f63cc9960 100644 --- a/benchmarks/gptoss_fp4_mi355x_docker.sh +++ b/benchmarks/gptoss_fp4_mi355x_docker.sh @@ -55,3 +55,8 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation (defaults to GSM8K) +run_lm_eval --port "$PORT" +append_lm_eval_summary +set +x \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_mi355x_slurm.sh b/benchmarks/gptoss_fp4_mi355x_slurm.sh index d21720add..0dd860bb1 100644 --- a/benchmarks/gptoss_fp4_mi355x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi355x_slurm.sh @@ -59,3 +59,8 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation (defaults to GSM8K) +run_lm_eval --port "$PORT" +append_lm_eval_summary +set +x \ No newline at end of file From 2de4a18a712acc85f1f2fe1c917684b03f0a61d2 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 15:36:32 -0600 Subject: [PATCH 055/214] evals mi325x-amd --- .github/workflows/eval-gms8k.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index a814e0da3..3b0f005ae 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -43,7 +43,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi300x-amd_0 + runner: mi325x-amd_0 image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm From 21825ce02174139a1647f925a86901db1607c5a7 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 15:44:22 -0600 Subject: [PATCH 056/214] evals mi300x-tw --- .github/workflows/eval-gms8k.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 3b0f005ae..beaaa13dd 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -43,7 +43,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi325x-amd_0 + runner: mi300x-tw_0 image: ${{ inputs.image || 
'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm From 00bfa341656c98f56287eade039a9ca126751d33 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 15:46:28 -0600 Subject: [PATCH 057/214] evals mi300x-oci --- .github/workflows/eval-gms8k.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index beaaa13dd..c74b7fe8a 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -43,7 +43,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi300x-tw_0 + runner: mi300x-oci_0 image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm From e8aa07eed7e6fddcbbaa2fb8ebd884c3dc81d150 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 15:47:04 -0600 Subject: [PATCH 058/214] evals mi325x-tw --- .github/workflows/eval-gms8k.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index c74b7fe8a..2a0308f6f 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -43,7 +43,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi300x-oci_0 + runner: mi325x-tw_0 image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm From bf4eff20534720e964bb64bc6a2cb1abc9b5e848 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 16:14:16 -0600 Subject: [PATCH 059/214] evals mi325x-tw summary --- benchmarks/benchmark_lib.sh | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 3a9df22bc..582358279 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -399,15 +399,34 @@ append_lm_eval_summary() { set +x local results_dir="${EVAL_RESULT_DIR:-eval_out}" local task="${EVAL_TASK:-gsm8k}" - if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then - python3 utils/lm_eval_to_md.py \ + # Render markdown once, then decide where to write it to avoid redirection errors + local md_out + md_out=$(python3 utils/lm_eval_to_md.py \ --results-dir "/workspace/${results_dir}" \ --task "${task}" \ --framework "${FRAMEWORK}" \ --precision "${PRECISION}" \ --tp "${TP:-1}" \ --ep "${EP_SIZE:-1}" \ - --dp-attention "${DP_ATTENTION:-false}" \ - >> "$GITHUB_STEP_SUMMARY" || true + --dp-attention "${DP_ATTENTION:-false}" 2>/dev/null || true) + + # If nothing was produced, nothing to append + if [ -z "${md_out}" ]; then + return 0 fi + + # Prefer GitHub step summary when available and path is valid; otherwise fallback to workspace file + if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then + local _gh_path="$GITHUB_STEP_SUMMARY" + local _gh_dir + _gh_dir="$(dirname "$_gh_path")" + if [ -d "$_gh_dir" ]; then + printf "%s\n" "${md_out}" >> "$_gh_path" || true + return 0 + fi + fi + + # Fallback: write to a summary file alongside results + mkdir -p "/workspace/${results_dir}" 2>/dev/null || true + printf "%s\n" "${md_out}" >> "/workspace/${results_dir}/SUMMARY.md" || true } From 71008bb68d19e0e87eed1612ad81c254e20b9f90 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 16:16:04 -0600 Subject: [PATCH 060/214] evals 
mi325x-tw summary --- .github/workflows/eval-gms8k.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 2a0308f6f..35bd4751a 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -43,7 +43,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi325x-tw_0 + runner: mi325x-tw_1 image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm From 7f3cd094551525bc50a315a1e2c705b021425cc7 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 16:25:23 -0600 Subject: [PATCH 061/214] evals mi355x-amd --- .github/workflows/eval-gms8k.yml | 2 +- .github/workflows/eval-tmpl.yml | 13 +++++ benchmarks/benchmark_lib.sh | 84 +++++++++++++++++++++++++++++--- 3 files changed, 92 insertions(+), 7 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 35bd4751a..188b36546 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -43,7 +43,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi325x-tw_1 + runner: mi355x-amd_4 image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 7b42a7853..2f2ccd621 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -125,6 +125,19 @@ jobs: run: | bash ./runners/launch_${RUNNER_NAME%%_*}.sh + - name: Append eval summary + if: always() + shell: bash + run: | + # If the compute node couldn't write to $GITHUB_STEP_SUMMARY directly, + # our scripts wrote a fallback markdown to ${EVAL_RESULT_DIR}/SUMMARY.md. 
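+        # Relaying it here keeps eval results visible in the run's summary page.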
+ if [[ -f "${{ env.EVAL_RESULT_DIR }}/SUMMARY.md" ]]; then + echo "Appending evaluation summary to GitHub step summary" + cat "${{ env.EVAL_RESULT_DIR }}/SUMMARY.md" >> "$GITHUB_STEP_SUMMARY" + else + echo "No fallback summary found at '${{ env.EVAL_RESULT_DIR }}/SUMMARY.md'" + fi + - name: Upload eval artifacts if: always() uses: actions/upload-artifact@v5 diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 582358279..52e6cdbf8 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -233,17 +233,22 @@ _patch_lm_eval_filters() { local patch_dir patch_dir="$(mktemp -d)" cat > "$patch_dir/sitecustomize.py" <<'PY' -import re, sys, unicodedata +# sitecustomize.py — loaded automatically by Python if on PYTHONPATH +import re, sys, unicodedata, types + +# ----------------------------- +# 1) Safe regex filters (yours) +# ----------------------------- from lm_eval.filters import extraction as ex def _s(x): # coerce to str return x if isinstance(x, str) else "" -# --- Patch RegexFilter.apply (used by many datasets) --- +# --- RegexFilter.apply --- _orig_regex_apply = ex.RegexFilter.apply def _safe_regex_apply(self, resps, docs): out = [] - for inst in resps: # inst is a list of candidate responses for one doc + for inst in resps: # list of candidates for one doc filtered = [] for resp in inst: txt = _s(resp) @@ -261,7 +266,7 @@ def _safe_regex_apply(self, resps, docs): return out ex.RegexFilter.apply = _safe_regex_apply -# --- Patch MultiChoiceRegexFilter.apply (used by GSM8K flexible-extract) --- +# --- MultiChoiceRegexFilter.apply (used by GSM8K flexible-extract) --- _orig_mc_apply = ex.MultiChoiceRegexFilter.apply def _safe_mc_apply(self, resps, docs): def find_match(regex, resp, convert_dict={}): @@ -298,7 +303,7 @@ def _safe_mc_apply(self, resps, docs): out = [] for r, doc in zip(resps, docs): - # Build fallback regexes from choices (A, B, C, ...) as in upstream + # Build fallback regexes from choices (A, B, C, ...) as upstream fallback_regexes, choice_to_alpha = [], {} next_alpha = "A" without_paren, without_paren_to_target = [], {} @@ -325,8 +330,75 @@ def _safe_mc_apply(self, resps, docs): filtered.append(m) out.append(filtered) return out - ex.MultiChoiceRegexFilter.apply = _safe_mc_apply + +# ----------------------------------------------------- +# 2) Fallback to reasoning_content in parse_generations +# ----------------------------------------------------- +# For OpenAI-like chat completions, some servers return: +# choices[0].message.content == None +# choices[0].message.reasoning_content == "" +# If so, return reasoning_content instead of None; if both missing, return "". 
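+# Illustrative response this guards against (assumed shape, not a captured payload):
+#   {"choices": [{"message": {"content": None,
+#                             "reasoning_content": "... the answer is 42."}}]}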
+ +from lm_eval.models.api_models import TemplateAPI + +def _wrap_parse_generations_on_class(cls): + if not hasattr(cls, "parse_generations"): + return + orig = cls.parse_generations + # parse_generations is a @staticmethod on API models; preserve staticmethod + def wrapped(*, outputs, **kwargs): + # First, run the original + res = orig(outputs=outputs, **kwargs) + # Normalize to list for convenience + if isinstance(res, (str, type(None))): + res = [res] + outputs_list = [outputs] + else: + outputs_list = outputs if isinstance(outputs, list) else [outputs] + + def _fallback_from_output(o): + try: + # OpenAI-style: dict -> choices[0] -> message + ch0 = (o or {}).get("choices", [{}])[0] + msg = ch0.get("message", {}) or {} + txt = msg.get("content") + if txt is None: + # Newer servers may use reasoning_content + txt = msg.get("reasoning_content") + if txt is None: + # Some servers put it at choices[0].reasoning.content + txt = (ch0.get("reasoning") or {}).get("content") + return "" if txt is None else txt + except Exception: + return "" + fb = [_fallback_from_output(o) for o in outputs_list] + + # Replace None/empty only if a fallback exists + res_out = [] + for i, v in enumerate(res): + if (v is None or v == "") and i < len(fb) and fb[i]: + res_out.append(fb[i]) + else: + # still coerce None -> "" so downstream filters never see None + res_out.append("" if v is None else v) + return res_out + + # Rebind as staticmethod to match original decoration + cls.parse_generations = staticmethod(wrapped) + +# Try to patch common OpenAI-like chat backends +try: + from lm_eval.models import openai_like as oli + for name in dir(oli): + obj = getattr(oli, name) + if isinstance(obj, type) and issubclass(obj, TemplateAPI): + # Heuristically target chat-style classes only + if "Chat" in obj.__name__ or "OpenAI" in obj.__name__: + _wrap_parse_generations_on_class(obj) +except Exception: + # If module layout changes, fail soft; your regex guards still protect filters. 
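+    # (e.g. a chat class such as LocalChatCompletion would be wrapped above; the name varies by lm-eval version)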
+ pass PY export PYTHONPATH="${patch_dir}:${PYTHONPATH:-}" } From dfff2f44ded4e03c88909ea2e3aec9f3e823a011 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 16:43:41 -0600 Subject: [PATCH 062/214] evals mi325x-tw summary --- .github/workflows/eval-gms8k.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 188b36546..35bd4751a 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -43,7 +43,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi355x-amd_4 + runner: mi325x-tw_1 image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm From 9a11152d612a7964e903806cbb989cb9e860a488 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 17:06:59 -0600 Subject: [PATCH 063/214] evals mi325x-tw summary --- .github/workflows/eval-tmpl.yml | 13 ------------- runners/launch_mi300x-amd.sh | 11 +++++++++++ runners/launch_mi325x-amd.sh | 10 ++++++++++ runners/launch_mi355x-amd.sh | 11 +++++++++++ 4 files changed, 32 insertions(+), 13 deletions(-) diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 2f2ccd621..7b42a7853 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -125,19 +125,6 @@ jobs: run: | bash ./runners/launch_${RUNNER_NAME%%_*}.sh - - name: Append eval summary - if: always() - shell: bash - run: | - # If the compute node couldn't write to $GITHUB_STEP_SUMMARY directly, - # our scripts wrote a fallback markdown to ${EVAL_RESULT_DIR}/SUMMARY.md. - if [[ -f "${{ env.EVAL_RESULT_DIR }}/SUMMARY.md" ]]; then - echo "Appending evaluation summary to GitHub step summary" - cat "${{ env.EVAL_RESULT_DIR }}/SUMMARY.md" >> "$GITHUB_STEP_SUMMARY" - else - echo "No fallback summary found at '${{ env.EVAL_RESULT_DIR }}/SUMMARY.md'" - fi - - name: Upload eval artifacts if: always() uses: actions/upload-artifact@v5 diff --git a/runners/launch_mi300x-amd.sh b/runners/launch_mi300x-amd.sh index 780e5a2f0..85fa1f8c7 100644 --- a/runners/launch_mi300x-amd.sh +++ b/runners/launch_mi300x-amd.sh @@ -8,6 +8,16 @@ PORT=8888 server_name="bmk-server" set -x +# Propagate GitHub summary file into the container when available +GH_SUM_ENV="" +GH_SUM_MOUNT="" +if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then + GH_SUM_ENV="-e GITHUB_STEP_SUMMARY=${GITHUB_STEP_SUMMARY}" + GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" + if [ -d "${GH_SUM_DIR}" ]; then + GH_SUM_MOUNT="-v ${GH_SUM_DIR}:${GH_SUM_DIR}" + fi +fi docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ @@ -15,6 +25,7 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ -e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \ +${GH_SUM_ENV} ${GH_SUM_MOUNT} \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi300x_docker.sh" diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh index 1065167d7..008e42577 100644 --- a/runners/launch_mi325x-amd.sh +++ b/runners/launch_mi325x-amd.sh @@ -22,3 +22,13 @@ srun 
--jobid=$JOB_ID \ bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_mi325x_slurm.sh scancel $JOB_ID + +# Append eval summary within this same step when available +if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then + GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" + if [ -d "${GH_SUM_DIR}" ]; then + if [ -f "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" ]; then + cat "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" >> "${GITHUB_STEP_SUMMARY}" || true + fi + fi +fi diff --git a/runners/launch_mi355x-amd.sh b/runners/launch_mi355x-amd.sh index 5f3cbb290..b1b11ff95 100644 --- a/runners/launch_mi355x-amd.sh +++ b/runners/launch_mi355x-amd.sh @@ -30,6 +30,16 @@ else fi set -x +# Propagate GitHub summary file into the container when available +GH_SUM_ENV="" +GH_SUM_MOUNT="" +if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then + GH_SUM_ENV="-e GITHUB_STEP_SUMMARY=${GITHUB_STEP_SUMMARY}" + GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" + if [ -d "${GH_SUM_DIR}" ]; then + GH_SUM_MOUNT="-v ${GH_SUM_DIR}:${GH_SUM_DIR}" + fi +fi docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ @@ -37,6 +47,7 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT -e NUM_PROMPTS \ -e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \ +${GH_SUM_ENV} ${GH_SUM_MOUNT} \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi355x_docker.sh" From 1ead6959b4a946fa5f0538f381ced78527871f07 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 17:18:11 -0600 Subject: [PATCH 064/214] evals mi325x-tw summary --- .github/workflows/eval-tmpl.yml | 2 +- runners/launch_mi325x-amd.sh | 1 + runners/launch_mi325x-tw.sh | 11 +++++++++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 7b42a7853..c2363540a 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -68,7 +68,7 @@ env: CONC: '16' MAX_MODEL_LEN: '8192' ISL: 1024 - OSL: 8192 + OSL: 1024 RANDOM_RANGE_RATIO: '1.0' RESULT_FILENAME: results diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh index 008e42577..68affc9a1 100644 --- a/runners/launch_mi325x-amd.sh +++ b/runners/launch_mi325x-amd.sh @@ -11,6 +11,7 @@ salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=256 --time=180 --no JOB_ID=$(squeue -u $USER -h -o %A | head -n1) srun --jobid=$JOB_ID bash -c "sudo enroot import -o $SQUASH_FILE docker://$IMAGE" + srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ diff --git a/runners/launch_mi325x-tw.sh b/runners/launch_mi325x-tw.sh index 488ce6ceb..aa87a424d 100644 --- a/runners/launch_mi325x-tw.sh +++ b/runners/launch_mi325x-tw.sh @@ -11,6 +11,7 @@ salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=128 --time=180 --no JOB_ID=$(squeue -u $USER -h -o %A | head -n1) srun --jobid=$JOB_ID bash -c "sudo enroot import -o $SQUASH_FILE docker://$IMAGE" + srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ @@ -22,3 +23,13 @@ srun --jobid=$JOB_ID \ bash 
benchmarks/${EXP_NAME%%_*}_${PRECISION}_mi325x_slurm.sh scancel $JOB_ID + +# Fallback: append summary after job completes if container couldn't write directly +if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then + GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" + if [ -d "${GH_SUM_DIR}" ]; then + if [ -f "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" ]; then + cat "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" >> "${GITHUB_STEP_SUMMARY}" || true + fi + fi +fi From 348d5d9dbdcc6441776a077ffdb94f7e52ccb438 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 17:33:20 -0600 Subject: [PATCH 065/214] all summary --- runners/launch_b200-nb.sh | 12 +++++++++++- runners/launch_b200-nv.sh | 10 ++++++++++ runners/launch_b200-nvd.sh | 12 ++++++++++++ runners/launch_b200-tg.sh | 12 ++++++++++++ runners/launch_h100-cr.sh | 11 +++++++++++ runners/launch_h100-cw.sh | 10 ++++++++++ runners/launch_h200-cw.sh | 10 ++++++++++ runners/launch_h200-nb.sh | 10 ++++++++++ runners/launch_h200-nv.sh | 10 ++++++++++ runners/launch_mi300x-cr.sh | 11 +++++++++++ runners/launch_mi300x-oci.sh | 11 +++++++++++ 11 files changed, 118 insertions(+), 1 deletion(-) diff --git a/runners/launch_b200-nb.sh b/runners/launch_b200-nb.sh index ecd1466dd..1502d0268 100644 --- a/runners/launch_b200-nb.sh +++ b/runners/launch_b200-nb.sh @@ -14,4 +14,14 @@ srun --partition=$PARTITION --gres=gpu:$TP --exclusive \ --no-container-entrypoint --export=ALL,PORT_OFFSET=${USER: -1} \ bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_slurm.sh -scancel $JOB_ID \ No newline at end of file +scancel $JOB_ID + +# Append eval summary within this same step when available +if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then + GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" + if [ -d "${GH_SUM_DIR}" ]; then + if [ -f "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" ]; then + cat "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" >> "${GITHUB_STEP_SUMMARY}" || true + fi + fi +fi diff --git a/runners/launch_b200-nv.sh b/runners/launch_b200-nv.sh index 243e624f9..8a1afff8e 100644 --- a/runners/launch_b200-nv.sh +++ b/runners/launch_b200-nv.sh @@ -23,3 +23,13 @@ srun --jobid=$JOB_ID \ bash benchmarks/${MODEL_CODE}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_slurm.sh scancel $JOB_ID + +# Append eval summary within this same step when available +if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then + GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" + if [ -d "${GH_SUM_DIR}" ]; then + if [ -f "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" ]; then + cat "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" >> "${GITHUB_STEP_SUMMARY}" || true + fi + fi +fi diff --git a/runners/launch_b200-nvd.sh b/runners/launch_b200-nvd.sh index c5216b006..12bb66b99 100644 --- a/runners/launch_b200-nvd.sh +++ b/runners/launch_b200-nvd.sh @@ -35,6 +35,17 @@ else export NUM_PROMPTS=$(( CONC * 10 )) fi +## Propagate GitHub summary file into the container when available +GH_SUM_ENV="" +GH_SUM_MOUNT="" +if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then + GH_SUM_ENV="-e GITHUB_STEP_SUMMARY=${GITHUB_STEP_SUMMARY}" + GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" + if [ -d "${GH_SUM_DIR}" ]; then + GH_SUM_MOUNT="-v ${GH_SUM_DIR}:${GH_SUM_DIR}" + fi +fi + docker run --rm --init --network host --name $server_name \ --runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ @@ -43,6 +54,7 @@ docker run --rm --init --network host 
--name $server_name \ -e NCCL_GRAPH_REGISTER=0 \ -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e NUM_PROMPTS \ + ${GH_SUM_ENV} ${GH_SUM_MOUNT} \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh" diff --git a/runners/launch_b200-tg.sh b/runners/launch_b200-tg.sh index 97e975a64..b82e25276 100644 --- a/runners/launch_b200-tg.sh +++ b/runners/launch_b200-tg.sh @@ -7,6 +7,17 @@ PORT=8888 server_name="bmk-server" set -x +## Propagate GitHub summary file into the container when available +GH_SUM_ENV="" +GH_SUM_MOUNT="" +if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then + GH_SUM_ENV="-e GITHUB_STEP_SUMMARY=${GITHUB_STEP_SUMMARY}" + GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" + if [ -d "${GH_SUM_DIR}" ]; then + GH_SUM_MOUNT="-v ${GH_SUM_DIR}:${GH_SUM_DIR}" + fi +fi + docker run --rm -d --network host --name $server_name \ --runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ @@ -14,6 +25,7 @@ docker run --rm -d --network host --name $server_name \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE \ -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ + ${GH_SUM_ENV} ${GH_SUM_MOUNT} \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh" diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh index d1ddc26de..9815e4884 100644 --- a/runners/launch_h100-cr.sh +++ b/runners/launch_h100-cr.sh @@ -6,12 +6,23 @@ PORT=8888 server_name="bmk-server" set -x +## Propagate GitHub summary file into the container when available +GH_SUM_ENV="" +GH_SUM_MOUNT="" +if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then + GH_SUM_ENV="-e GITHUB_STEP_SUMMARY=${GITHUB_STEP_SUMMARY}" + GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" + if [ -d "${GH_SUM_DIR}" ]; then + GH_SUM_MOUNT="-v ${GH_SUM_DIR}:${GH_SUM_DIR}" + fi +fi docker run --rm --network=host --name=$server_name \ --runtime=nvidia --gpus=all --ipc=host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \ -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ + ${GH_SUM_ENV} ${GH_SUM_MOUNT} \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_h100_docker.sh" diff --git a/runners/launch_h100-cw.sh b/runners/launch_h100-cw.sh index 0179bdd57..864dc9c95 100644 --- a/runners/launch_h100-cw.sh +++ b/runners/launch_h100-cw.sh @@ -18,3 +18,13 @@ srun --jobid=$JOB_ID \ bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_h100_slurm.sh scancel $JOB_ID + +# Append eval summary within this same step when available +if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then + GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" + if [ -d "${GH_SUM_DIR}" ]; then + if [ -f "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" ]; then + cat 
"${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" >> "${GITHUB_STEP_SUMMARY}" || true + fi + fi +fi diff --git a/runners/launch_h200-cw.sh b/runners/launch_h200-cw.sh index dd4937606..431e027f2 100644 --- a/runners/launch_h200-cw.sh +++ b/runners/launch_h200-cw.sh @@ -30,3 +30,13 @@ srun --jobid=$JOB_ID \ bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}_slurm.sh scancel $JOB_ID + +# Append eval summary within this same step when available +if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then + GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" + if [ -d "${GH_SUM_DIR}" ]; then + if [ -f "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" ]; then + cat "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" >> "${GITHUB_STEP_SUMMARY}" || true + fi + fi +fi diff --git a/runners/launch_h200-nb.sh b/runners/launch_h200-nb.sh index c76b366d2..19d6e82ba 100644 --- a/runners/launch_h200-nb.sh +++ b/runners/launch_h200-nb.sh @@ -30,3 +30,13 @@ srun --jobid=$JOB_ID \ bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}_slurm.sh scancel $JOB_ID + +# Append eval summary within this same step when available +if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then + GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" + if [ -d "${GH_SUM_DIR}" ]; then + if [ -f "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" ]; then + cat "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" >> "${GITHUB_STEP_SUMMARY}" || true + fi + fi +fi diff --git a/runners/launch_h200-nv.sh b/runners/launch_h200-nv.sh index 5319f8959..ca2ea6079 100644 --- a/runners/launch_h200-nv.sh +++ b/runners/launch_h200-nv.sh @@ -23,3 +23,13 @@ srun --jobid=$JOB_ID \ bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}_slurm.sh scancel $JOB_ID + +# Append eval summary within this same step when available +if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then + GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" + if [ -d "${GH_SUM_DIR}" ]; then + if [ -f "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" ]; then + cat "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" >> "${GITHUB_STEP_SUMMARY}" || true + fi + fi +fi diff --git a/runners/launch_mi300x-cr.sh b/runners/launch_mi300x-cr.sh index 8fbdaee63..4c9d56e7e 100644 --- a/runners/launch_mi300x-cr.sh +++ b/runners/launch_mi300x-cr.sh @@ -8,6 +8,16 @@ PORT=8888 server_name="bmk-server" set -x +## Propagate GitHub summary file into the container when available +GH_SUM_ENV="" +GH_SUM_MOUNT="" +if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then + GH_SUM_ENV="-e GITHUB_STEP_SUMMARY=${GITHUB_STEP_SUMMARY}" + GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" + if [ -d "${GH_SUM_DIR}" ]; then + GH_SUM_MOUNT="-v ${GH_SUM_DIR}:${GH_SUM_DIR}" + fi +fi docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ @@ -15,6 +25,7 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ -e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \ + ${GH_SUM_ENV} ${GH_SUM_MOUNT} \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi300x_docker.sh" diff --git a/runners/launch_mi300x-oci.sh b/runners/launch_mi300x-oci.sh index 33614a03c..f1123d722 100644 --- 
a/runners/launch_mi300x-oci.sh +++ b/runners/launch_mi300x-oci.sh @@ -6,6 +6,16 @@ PORT=8888 server_name="bmk-server" set -x +## Propagate GitHub summary file into the container when available +GH_SUM_ENV="" +GH_SUM_MOUNT="" +if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then + GH_SUM_ENV="-e GITHUB_STEP_SUMMARY=${GITHUB_STEP_SUMMARY}" + GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" + if [ -d "${GH_SUM_DIR}" ]; then + GH_SUM_MOUNT="-v ${GH_SUM_DIR}:${GH_SUM_DIR}" + fi +fi docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ @@ -13,6 +23,7 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ -e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \ + ${GH_SUM_ENV} ${GH_SUM_MOUNT} \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi300x_docker.sh" From 679caa66a213db5de9658dd17d04bc8298517f36 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 18:11:05 -0600 Subject: [PATCH 066/214] evals b200-nvd --- .github/workflows/eval-gms8k.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 35bd4751a..3fc810a48 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -43,8 +43,8 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi325x-tw_1 - image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} + runner: b200-nvd_0 + image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm precision: fp4 From eda5e2f46186553bf06c43aafabb763777a470f0 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 22:55:17 -0600 Subject: [PATCH 067/214] evals b200-nvd 2 --- .github/workflows/eval-gms8k.yml | 1 + benchmarks/gptoss_fp4_b200_docker.sh | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 3fc810a48..af33fddbe 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -56,3 +56,4 @@ jobs: eval-task: gsm8k num-fewshot: ${{ inputs.num_fewshot || '5' }} limit: ${{ inputs.limit || '200' }} + diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh index 1736701c4..8b5b6c881 100644 --- a/benchmarks/gptoss_fp4_b200_docker.sh +++ b/benchmarks/gptoss_fp4_b200_docker.sh @@ -71,4 +71,8 @@ run_benchmark_serving \ --num-prompts "$NUM_PROMPTS" \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ No newline at end of file + --result-dir /workspace/ + +# After throughput, run evaluation (defaults to GSM8K) +run_lm_eval --port "$PORT" +append_lm_eval_summary \ No newline at end of file From 42151cc85070b6a268f6848ca4a3e03a88428aff Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 22:59:45 -0600 Subject: [PATCH 068/214] evals b200-nvd 3 --- .github/workflows/eval-gms8k.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index af33fddbe..a5265a57e 100644 --- 
a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -43,7 +43,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: b200-nvd_0 + runner: b200-nvd_3 image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm From 512dfc065ee1ee02a73177648a7e35cc222b9350 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 23:11:30 -0600 Subject: [PATCH 069/214] evals h100-cr --- .github/workflows/eval-gms8k.yml | 2 +- benchmarks/gptoss_fp4_h100_docker.sh | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index a5265a57e..bef7a2d08 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -43,7 +43,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: b200-nvd_3 + runner: h100-cr_0 image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index e8f7bdd1d..212059b04 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -57,3 +57,7 @@ run_benchmark_serving \ --max-concurrency 512 \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation (defaults to GSM8K) +run_lm_eval --port "$PORT" +append_lm_eval_summary \ No newline at end of file From 4de631dc5e74d596afc155f008ec65057fbeb6fe Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 23:15:36 -0600 Subject: [PATCH 070/214] evals b200-nvd 1 --- .github/workflows/eval-gms8k.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index bef7a2d08..aa442cdd2 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -43,7 +43,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: h100-cr_0 + runner: b200-nvd_1 image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm From b33cb80edc8f1d13975b0b761aa7e67223f5941e Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 23:20:48 -0600 Subject: [PATCH 071/214] evals h200-trt-cw --- .github/workflows/eval-gms8k.yml | 4 ++-- benchmarks/dsr1_fp8_h200_trt_slurm.sh | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index aa442cdd2..e44ed3854 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -43,10 +43,10 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: b200-nvd_1 + runner: h200-cw_0 image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} - framework: vllm + framework: trt precision: fp4 exp-name: gptoss_gsm8k_poc tp: ${{ inputs.tp || '4' }} diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index 4ece6f7bc..c829e66b5 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -89,3 +89,7 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation (defaults to GSM8K) +run_lm_eval --port "$PORT" 
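+# append_lm_eval_summary prefers $GITHUB_STEP_SUMMARY and falls back to ${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md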
+append_lm_eval_summary From 5babdb0811e148b86b0d36c2b557b48a75974bd8 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 23:30:17 -0600 Subject: [PATCH 072/214] evals h200-trt-cw 2 --- .github/workflows/eval-gms8k.yml | 2 +- benchmarks/gptoss_fp4_h200_trt_slurm.sh | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index e44ed3854..8446adaf8 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -44,7 +44,7 @@ jobs: secrets: inherit with: runner: h200-cw_0 - image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} + image: ${{ inputs.image || 'nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-devs' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: trt precision: fp4 diff --git a/benchmarks/gptoss_fp4_h200_trt_slurm.sh b/benchmarks/gptoss_fp4_h200_trt_slurm.sh index 21d6ae02c..58768e3a2 100644 --- a/benchmarks/gptoss_fp4_h200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_trt_slurm.sh @@ -78,3 +78,7 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation (defaults to GSM8K) +run_lm_eval --port "$PORT" +append_lm_eval_summary \ No newline at end of file From 12a85b839c1dafc5087b7dfe8bd35d635c0aaa27 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 23:37:39 -0600 Subject: [PATCH 073/214] evals h200-trt-cw 3 --- .github/workflows/eval-gms8k.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 8446adaf8..3559abf31 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -44,7 +44,7 @@ jobs: secrets: inherit with: runner: h200-cw_0 - image: ${{ inputs.image || 'nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-devs' }} + image: ${{ inputs.image || 'nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: trt precision: fp4 From eb2846facefb4ac9dc7f8da78e57086e7dc7dc7e Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 23:38:59 -0600 Subject: [PATCH 074/214] evals h100-cr 2 --- .github/workflows/eval-gms8k.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 3559abf31..bef7a2d08 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -43,10 +43,10 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: h200-cw_0 - image: ${{ inputs.image || 'nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev' }} + runner: h100-cr_0 + image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} - framework: trt + framework: vllm precision: fp4 exp-name: gptoss_gsm8k_poc tp: ${{ inputs.tp || '4' }} From 41660708ef15e896ff315fa1527707ff9709a55e Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 23:45:07 -0600 Subject: [PATCH 075/214] evals h200-trt-cw 4 --- .github/workflows/eval-gms8k.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index bef7a2d08..2902d6ce0 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -17,7 +17,7 @@ on: description: "Tensor Parallel Size" required: false type: string - default: "4" + default: "1" port: 
description: "Server port" required: false @@ -43,10 +43,10 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: h100-cr_0 - image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} + runner: h200-cw_0 + image: ${{ inputs.image || 'nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} - framework: vllm + framework: trt precision: fp4 exp-name: gptoss_gsm8k_poc tp: ${{ inputs.tp || '4' }} From 5f6b772d660c9d2d910a6070599809e912777942 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 15 Nov 2025 23:48:41 -0600 Subject: [PATCH 076/214] evals h200-trt-cw 5 (EP/TP HARD) --- .github/workflows/eval-gms8k.yml | 4 ++-- benchmarks/gptoss_fp4_h200_trt_slurm.sh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 2902d6ce0..b0e90175a 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -1,4 +1,4 @@ -name: Eval - GSM8K on H100 (PoC) +name: Eval - GSM8K (PoC) on: workflow_dispatch: @@ -49,7 +49,7 @@ jobs: framework: trt precision: fp4 exp-name: gptoss_gsm8k_poc - tp: ${{ inputs.tp || '4' }} + tp: ${{ inputs.tp || '1' }} ep: '1' dp-attn: false port: ${{ inputs.port || '8888' }} diff --git a/benchmarks/gptoss_fp4_h200_trt_slurm.sh b/benchmarks/gptoss_fp4_h200_trt_slurm.sh index 58768e3a2..3c959a7b1 100644 --- a/benchmarks/gptoss_fp4_h200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_trt_slurm.sh @@ -50,12 +50,12 @@ trtllm-serve $MODEL \ --max_num_tokens 20000 \ --backend pytorch \ --extra_llm_api_options gptoss-config.yml \ ---ep_size=$EP_SIZE \ +--ep_size=1 \ --trust_remote_code \ --gpus_per_node 8 \ --host 0.0.0.0 \ --port $PORT \ ---tp_size=$TP \ +--tp_size=1 \ --pp_size=1 \ > $SERVER_LOG 2>&1 & From 30baa1f0b52da8c95979cefd5443fa42a1a9bd26 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 16 Nov 2025 00:19:25 -0600 Subject: [PATCH 077/214] evals h200-trt-cw 6 (EP/TP HARD) --- benchmarks/benchmark_lib.sh | 96 ++++++++++++++++++++++++++++++++++++- 1 file changed, 95 insertions(+), 1 deletion(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 52e6cdbf8..e1115a602 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -234,7 +234,101 @@ _patch_lm_eval_filters() { patch_dir="$(mktemp -d)" cat > "$patch_dir/sitecustomize.py" <<'PY' # sitecustomize.py — loaded automatically by Python if on PYTHONPATH -import re, sys, unicodedata, types +import os, re, sys, unicodedata, types + +# -------------------------------------------------------- +# Transport-level shim: normalize chat completion requests +# -------------------------------------------------------- +# Some lm-eval builds may emit Responses-style message shapes +# (message.type, role "developer", structured content lists). +# Many OpenAI-compatible servers for /v1/chat/completions expect +# classic roles (system/user/assistant) and string content. +# +# This shim rewrites payloads sent to */v1/chat/completions into +# the classic format. It is no-op for other endpoints. 
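+#
+# Illustrative rewrite (assumed shapes, for reference only):
+#   in:  {"role": "developer", "type": "message",
+#         "content": [{"type": "input_text", "text": "2+2?"}]}
+#   out: {"role": "system", "content": "2+2?"}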
+ +def _flatten_content_to_text(content): + if content is None: + return "" + if isinstance(content, str): + return content + if isinstance(content, list): + parts = [] + for p in content: + if not isinstance(p, dict): + continue + t = p.get("type") or p.get("role") + if t in ("text", "input_text", None): + txt = p.get("text") + if txt is None: + txt = p.get("content") + if txt is None and isinstance(p.get("text"), dict): + txt = p["text"].get("content") + if txt: + parts.append(str(txt)) + return "".join(parts) + try: + return str(content) + except Exception: + return "" + +def _normalize_messages(payload): + try: + msgs = payload.get("messages") + if not isinstance(msgs, list): + return payload + norm = [] + for m in msgs: + if not isinstance(m, dict): + continue + role = m.get("role", "user") + if role == "developer": + role = "system" + m = {k: v for k, v in m.items() if k != "type"} + content = m.get("content") + if content is None: + content = m.get("text") if isinstance(m.get("text"), (str, list, dict)) else m.get("input") + m_out = {"role": role, "content": _flatten_content_to_text(content)} + if isinstance(m.get("name"), str): + m_out["name"] = m["name"] + norm.append(m_out) + payload["messages"] = norm + except Exception: + return payload + return payload + +def _patch_http_clients(): + # requests + try: + import requests + _orig_req = requests.sessions.Session.request + def _wrapped_request(self, method, url, *args, **kwargs): + if isinstance(kwargs.get("json"), dict) and "/chat/completions" in str(url): + kwargs["json"] = _normalize_messages(dict(kwargs["json"])) + return _orig_req(self, method, url, *args, **kwargs) + requests.sessions.Session.request = _wrapped_request + except Exception: + pass + # httpx sync/async + try: + import httpx + _orig_httpx = httpx.Client.request + def _wrapped_httpx(self, method, url, *args, **kwargs): + if isinstance(kwargs.get("json"), dict) and "/chat/completions" in str(url): + kwargs["json"] = _normalize_messages(dict(kwargs["json"])) + return _orig_httpx(self, method, url, *args, **kwargs) + httpx.Client.request = _wrapped_httpx + _orig_async = httpx.AsyncClient.request + async def _wrapped_async(self, method, url, *args, **kwargs): + if isinstance(kwargs.get("json"), dict) and "/chat/completions" in str(url): + kwargs["json"] = _normalize_messages(dict(kwargs["json"])) + return await _orig_async(self, method, url, *args, **kwargs) + httpx.AsyncClient.request = _wrapped_async + except Exception: + pass + +if not os.environ.get("LM_EVAL_DISABLE_CHAT_SHIM"): + _patch_http_clients() # ----------------------------- # 1) Safe regex filters (yours) From 5a209fdd5453a52a8642115816e3cee0175b8eb5 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 16 Nov 2025 00:20:03 -0600 Subject: [PATCH 078/214] evals h200-trt-cw 6 (EP/TP HARD) --- .github/workflows/eval-gms8k.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index b0e90175a..a41894f12 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -32,7 +32,7 @@ on: description: "Sample limit for GSM8K" required: false type: string - default: "200" + default: "1300" push: paths: - '.github/workflows/eval-gms8k.yml' From 89a9cbddb089e1e8a2ed756b4d948db5f5dcf21b Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 16 Nov 2025 00:37:07 -0600 Subject: [PATCH 079/214] evals h200-cw dsr1 --- .github/workflows/eval-gms8k.yml | 14 +- benchmarks/benchmark_lib.sh | 206 
++---------------------------- benchmarks/dsr1_fp8_h200_slurm.sh | 4 + 3 files changed, 22 insertions(+), 202 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index a41894f12..47699b3a7 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -17,7 +17,7 @@ on: description: "Tensor Parallel Size" required: false type: string - default: "1" + default: "8" port: description: "Server port" required: false @@ -44,12 +44,12 @@ jobs: secrets: inherit with: runner: h200-cw_0 - image: ${{ inputs.image || 'nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev' }} - model: ${{ inputs.model || 'openai/gpt-oss-120b' }} - framework: trt - precision: fp4 - exp-name: gptoss_gsm8k_poc - tp: ${{ inputs.tp || '1' }} + image: ${{ inputs.image || 'lmsysorg/sglang:v0.5.5-cu129-amd64' }} + model: ${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }} + framework: sglang + precision: fp8 + exp-name: dsr1_gsm8k_poc + tp: '8' ep: '1' dp-attn: false port: ${{ inputs.port || '8888' }} diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index e1115a602..8f38d9e9c 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -233,116 +233,17 @@ _patch_lm_eval_filters() { local patch_dir patch_dir="$(mktemp -d)" cat > "$patch_dir/sitecustomize.py" <<'PY' -# sitecustomize.py — loaded automatically by Python if on PYTHONPATH -import os, re, sys, unicodedata, types - -# -------------------------------------------------------- -# Transport-level shim: normalize chat completion requests -# -------------------------------------------------------- -# Some lm-eval builds may emit Responses-style message shapes -# (message.type, role "developer", structured content lists). -# Many OpenAI-compatible servers for /v1/chat/completions expect -# classic roles (system/user/assistant) and string content. -# -# This shim rewrites payloads sent to */v1/chat/completions into -# the classic format. It is no-op for other endpoints. 
- -def _flatten_content_to_text(content): - if content is None: - return "" - if isinstance(content, str): - return content - if isinstance(content, list): - parts = [] - for p in content: - if not isinstance(p, dict): - continue - t = p.get("type") or p.get("role") - if t in ("text", "input_text", None): - txt = p.get("text") - if txt is None: - txt = p.get("content") - if txt is None and isinstance(p.get("text"), dict): - txt = p["text"].get("content") - if txt: - parts.append(str(txt)) - return "".join(parts) - try: - return str(content) - except Exception: - return "" - -def _normalize_messages(payload): - try: - msgs = payload.get("messages") - if not isinstance(msgs, list): - return payload - norm = [] - for m in msgs: - if not isinstance(m, dict): - continue - role = m.get("role", "user") - if role == "developer": - role = "system" - m = {k: v for k, v in m.items() if k != "type"} - content = m.get("content") - if content is None: - content = m.get("text") if isinstance(m.get("text"), (str, list, dict)) else m.get("input") - m_out = {"role": role, "content": _flatten_content_to_text(content)} - if isinstance(m.get("name"), str): - m_out["name"] = m["name"] - norm.append(m_out) - payload["messages"] = norm - except Exception: - return payload - return payload - -def _patch_http_clients(): - # requests - try: - import requests - _orig_req = requests.sessions.Session.request - def _wrapped_request(self, method, url, *args, **kwargs): - if isinstance(kwargs.get("json"), dict) and "/chat/completions" in str(url): - kwargs["json"] = _normalize_messages(dict(kwargs["json"])) - return _orig_req(self, method, url, *args, **kwargs) - requests.sessions.Session.request = _wrapped_request - except Exception: - pass - # httpx sync/async - try: - import httpx - _orig_httpx = httpx.Client.request - def _wrapped_httpx(self, method, url, *args, **kwargs): - if isinstance(kwargs.get("json"), dict) and "/chat/completions" in str(url): - kwargs["json"] = _normalize_messages(dict(kwargs["json"])) - return _orig_httpx(self, method, url, *args, **kwargs) - httpx.Client.request = _wrapped_httpx - _orig_async = httpx.AsyncClient.request - async def _wrapped_async(self, method, url, *args, **kwargs): - if isinstance(kwargs.get("json"), dict) and "/chat/completions" in str(url): - kwargs["json"] = _normalize_messages(dict(kwargs["json"])) - return await _orig_async(self, method, url, *args, **kwargs) - httpx.AsyncClient.request = _wrapped_async - except Exception: - pass - -if not os.environ.get("LM_EVAL_DISABLE_CHAT_SHIM"): - _patch_http_clients() - -# ----------------------------- -# 1) Safe regex filters (yours) -# ----------------------------- +import re, sys, unicodedata from lm_eval.filters import extraction as ex def _s(x): # coerce to str return x if isinstance(x, str) else "" -# --- RegexFilter.apply --- +# --- Patch RegexFilter.apply (used by many datasets) --- _orig_regex_apply = ex.RegexFilter.apply def _safe_regex_apply(self, resps, docs): out = [] - for inst in resps: # list of candidates for one doc + for inst in resps: # inst is a list of candidate responses for one doc filtered = [] for resp in inst: txt = _s(resp) @@ -360,7 +261,7 @@ def _safe_regex_apply(self, resps, docs): return out ex.RegexFilter.apply = _safe_regex_apply -# --- MultiChoiceRegexFilter.apply (used by GSM8K flexible-extract) --- +# --- Patch MultiChoiceRegexFilter.apply (used by GSM8K flexible-extract) --- _orig_mc_apply = ex.MultiChoiceRegexFilter.apply def _safe_mc_apply(self, resps, docs): def find_match(regex, resp, 
convert_dict={}): @@ -397,7 +298,7 @@ def _safe_mc_apply(self, resps, docs): out = [] for r, doc in zip(resps, docs): - # Build fallback regexes from choices (A, B, C, ...) as upstream + # Build fallback regexes from choices (A, B, C, ...) as in upstream fallback_regexes, choice_to_alpha = [], {} next_alpha = "A" without_paren, without_paren_to_target = [], {} @@ -424,75 +325,8 @@ def _safe_mc_apply(self, resps, docs): filtered.append(m) out.append(filtered) return out -ex.MultiChoiceRegexFilter.apply = _safe_mc_apply - -# ----------------------------------------------------- -# 2) Fallback to reasoning_content in parse_generations -# ----------------------------------------------------- -# For OpenAI-like chat completions, some servers return: -# choices[0].message.content == None -# choices[0].message.reasoning_content == "" -# If so, return reasoning_content instead of None; if both missing, return "". - -from lm_eval.models.api_models import TemplateAPI - -def _wrap_parse_generations_on_class(cls): - if not hasattr(cls, "parse_generations"): - return - orig = cls.parse_generations - # parse_generations is a @staticmethod on API models; preserve staticmethod - def wrapped(*, outputs, **kwargs): - # First, run the original - res = orig(outputs=outputs, **kwargs) - # Normalize to list for convenience - if isinstance(res, (str, type(None))): - res = [res] - outputs_list = [outputs] - else: - outputs_list = outputs if isinstance(outputs, list) else [outputs] - - def _fallback_from_output(o): - try: - # OpenAI-style: dict -> choices[0] -> message - ch0 = (o or {}).get("choices", [{}])[0] - msg = ch0.get("message", {}) or {} - txt = msg.get("content") - if txt is None: - # Newer servers may use reasoning_content - txt = msg.get("reasoning_content") - if txt is None: - # Some servers put it at choices[0].reasoning.content - txt = (ch0.get("reasoning") or {}).get("content") - return "" if txt is None else txt - except Exception: - return "" - fb = [_fallback_from_output(o) for o in outputs_list] - # Replace None/empty only if a fallback exists - res_out = [] - for i, v in enumerate(res): - if (v is None or v == "") and i < len(fb) and fb[i]: - res_out.append(fb[i]) - else: - # still coerce None -> "" so downstream filters never see None - res_out.append("" if v is None else v) - return res_out - - # Rebind as staticmethod to match original decoration - cls.parse_generations = staticmethod(wrapped) - -# Try to patch common OpenAI-like chat backends -try: - from lm_eval.models import openai_like as oli - for name in dir(oli): - obj = getattr(oli, name) - if isinstance(obj, type) and issubclass(obj, TemplateAPI): - # Heuristically target chat-style classes only - if "Chat" in obj.__name__ or "OpenAI" in obj.__name__: - _wrap_parse_generations_on_class(obj) -except Exception: - # If module layout changes, fail soft; your regex guards still protect filters. 
- pass +ex.MultiChoiceRegexFilter.apply = _safe_mc_apply PY export PYTHONPATH="${patch_dir}:${PYTHONPATH:-}" } @@ -565,34 +399,16 @@ append_lm_eval_summary() { set +x local results_dir="${EVAL_RESULT_DIR:-eval_out}" local task="${EVAL_TASK:-gsm8k}" - # Render markdown once, then decide where to write it to avoid redirection errors - local md_out - md_out=$(python3 utils/lm_eval_to_md.py \ + if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then + _ensure_bench_serving_repo + python3 XXX \ --results-dir "/workspace/${results_dir}" \ --task "${task}" \ --framework "${FRAMEWORK}" \ --precision "${PRECISION}" \ --tp "${TP:-1}" \ --ep "${EP_SIZE:-1}" \ - --dp-attention "${DP_ATTENTION:-false}" 2>/dev/null || true) - - # If nothing was produced, nothing to append - if [ -z "${md_out}" ]; then - return 0 + --dp-attention "${DP_ATTENTION:-false}" \ + >> "$GITHUB_STEP_SUMMARY" || true fi - - # Prefer GitHub step summary when available and path is valid; otherwise fallback to workspace file - if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then - local _gh_path="$GITHUB_STEP_SUMMARY" - local _gh_dir - _gh_dir="$(dirname "$_gh_path")" - if [ -d "$_gh_dir" ]; then - printf "%s\n" "${md_out}" >> "$_gh_path" || true - return 0 - fi - fi - - # Fallback: write to a summary file alongside results - mkdir -p "/workspace/${results_dir}" 2>/dev/null || true - printf "%s\n" "${md_out}" >> "/workspace/${results_dir}/SUMMARY.md" || true } diff --git a/benchmarks/dsr1_fp8_h200_slurm.sh b/benchmarks/dsr1_fp8_h200_slurm.sh index 06345ecb2..8e8ec7469 100644 --- a/benchmarks/dsr1_fp8_h200_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_slurm.sh @@ -63,3 +63,7 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation (defaults to GSM8K) +run_lm_eval --port "$PORT" +append_lm_eval_summary \ No newline at end of file From 9254ef19d5c9bb26a535240df0a2c9192e07640e Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 16 Nov 2025 00:56:20 -0600 Subject: [PATCH 080/214] evals mi300x-cr dsr1 --- .github/workflows/eval-gms8k.yml | 4 ++-- benchmarks/benchmark_lib.sh | 4 ++-- benchmarks/dsr1_fp8_mi300x_docker.sh | 4 ++++ 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 47699b3a7..1689cc9c1 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -43,8 +43,8 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: h200-cw_0 - image: ${{ inputs.image || 'lmsysorg/sglang:v0.5.5-cu129-amd64' }} + runner: mi300x-cr_0 + image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' }} model: ${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }} framework: sglang precision: fp8 diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 8f38d9e9c..861d0f483 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -347,8 +347,8 @@ run_lm_eval() { local task="${EVAL_TASK:-gsm8k}" local num_fewshot="${NUM_FEWSHOT:-5}" local results_dir="${EVAL_RESULT_DIR:-eval_out}" - local batch_size=2 - local gen_max_tokens=8192 + local batch_size=3 + local gen_max_tokens=4096 local temperature=0 local top_p=1 diff --git a/benchmarks/dsr1_fp8_mi300x_docker.sh b/benchmarks/dsr1_fp8_mi300x_docker.sh index 8c269dd83..3e604f3ca 100644 --- a/benchmarks/dsr1_fp8_mi300x_docker.sh +++ b/benchmarks/dsr1_fp8_mi300x_docker.sh @@ -56,3 +56,7 @@ run_benchmark_serving \ --max-concurrency 
"$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation (defaults to GSM8K) +run_lm_eval --port "$PORT" +append_lm_eval_summary \ No newline at end of file From 6705ea3a4b8bc68caa2654cffd1ef7cdfc271360 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 16 Nov 2025 00:58:07 -0600 Subject: [PATCH 081/214] evals mi300x-cr dsr1 2 --- .github/workflows/eval-tmpl.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index c2363540a..274ddcf92 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -101,6 +101,7 @@ jobs: docker ps -aq fi fi + sudo rm -rf /home/ubuntu/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out/openai__gpt-oss-120b if command -v squeue >/dev/null 2>&1; then echo "[Slurm] Cleaning up resources ..." scancel -u $USER || true From c1fc6db41958493ca82ac9fbc3b5ac8f402bf4ce Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 16 Nov 2025 01:14:08 -0600 Subject: [PATCH 082/214] evals mi325x-cr dsr1 --- .github/workflows/eval-gms8k.yml | 2 +- benchmarks/benchmark_lib.sh | 3 +-- benchmarks/dsr1_fp8_mi325x_slurm.sh | 4 ++++ 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 1689cc9c1..e7bb716ca 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -43,7 +43,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi300x-cr_0 + runner: mi325x-amd_0 image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' }} model: ${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }} framework: sglang diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 861d0f483..224b4dfd5 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -400,8 +400,7 @@ append_lm_eval_summary() { local results_dir="${EVAL_RESULT_DIR:-eval_out}" local task="${EVAL_TASK:-gsm8k}" if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then - _ensure_bench_serving_repo - python3 XXX \ + python3 utils/lm_eval_to_md.py \ --results-dir "/workspace/${results_dir}" \ --task "${task}" \ --framework "${FRAMEWORK}" \ diff --git a/benchmarks/dsr1_fp8_mi325x_slurm.sh b/benchmarks/dsr1_fp8_mi325x_slurm.sh index 67e4cc394..4e66a64fb 100644 --- a/benchmarks/dsr1_fp8_mi325x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi325x_slurm.sh @@ -42,3 +42,7 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation (defaults to GSM8K) +run_lm_eval --port "$PORT" +append_lm_eval_summary \ No newline at end of file From 090630a1555b6dd6b42102cf3be488b34b3de700 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 16 Nov 2025 01:28:05 -0600 Subject: [PATCH 083/214] evals mi325x-cr dsr1 2 --- .github/workflows/eval-gms8k.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index e7bb716ca..2c1347cb9 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -43,7 +43,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi325x-amd_0 + runner: mi325x-tw_0 image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' }} model: ${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }} framework: sglang From 
d984d7a06ec27111bcfd48d4d0233c067ef2ec5a Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 16 Nov 2025 01:31:39 -0600 Subject: [PATCH 084/214] evals mi355x-amd dsr1 --- .github/workflows/eval-gms8k.yml | 2 +- .github/workflows/eval-tmpl.yml | 2 +- benchmarks/dsr1_fp8_mi355x_docker.sh | 6 ++++-- benchmarks/dsr1_fp8_mi355x_slurm.sh | 5 +++++ 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 2c1347cb9..50c2390a9 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -43,7 +43,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi325x-tw_0 + runner: mi355x-amd_4 image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' }} model: ${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }} framework: sglang diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 274ddcf92..433ec1a68 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -65,7 +65,7 @@ env: LIMIT: ${{ inputs.limit }} EVAL_RESULT_DIR: eval_out # Server-side concurrency default (used by some server scripts) - CONC: '16' + CONC: '8' MAX_MODEL_LEN: '8192' ISL: 1024 OSL: 1024 diff --git a/benchmarks/dsr1_fp8_mi355x_docker.sh b/benchmarks/dsr1_fp8_mi355x_docker.sh index d4f1dd013..17e51344a 100644 --- a/benchmarks/dsr1_fp8_mi355x_docker.sh +++ b/benchmarks/dsr1_fp8_mi355x_docker.sh @@ -58,5 +58,7 @@ run_benchmark_serving \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ - - +# After throughput, run evaluation (defaults to GSM8K) +run_lm_eval --port "$PORT" +append_lm_eval_summary +set +x \ No newline at end of file diff --git a/benchmarks/dsr1_fp8_mi355x_slurm.sh b/benchmarks/dsr1_fp8_mi355x_slurm.sh index fd6fe49fb..b16c8e247 100644 --- a/benchmarks/dsr1_fp8_mi355x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi355x_slurm.sh @@ -51,3 +51,8 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation (defaults to GSM8K) +run_lm_eval --port "$PORT" +append_lm_eval_summary +set +x \ No newline at end of file From fb66e33543ffeb769422cae7ce9a886531f8b288 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 16 Nov 2025 07:48:59 -0600 Subject: [PATCH 085/214] evals mi355x-amd dsr1 2 --- .github/workflows/eval-tmpl.yml | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 433ec1a68..9354c4f67 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -80,16 +80,20 @@ jobs: steps: - name: Resource cleanup run: | - if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then + # Helper to avoid indefinite hangs on flaky Docker daemons + safe_timeout() { timeout -k 5 30s "$@"; } + + if command -v docker >/dev/null 2>&1 && safe_timeout docker info >/dev/null 2>&1; then host=$(hostname) if [[ "$host" == "b200-81" || "$host" == "b200-80" || "$host" == "b200-79" ]]; then echo "[INFO] Running container-by-container cleanup on $host" - for cid in $(docker ps -aq); do + cids=$(safe_timeout docker ps -aq || true) + for cid in $cids; do echo "[INFO] Cleaning container $cid" - docker stop -t 90 "$cid" || true - docker wait "$cid" >/dev/null 2>&1 || true - docker rm -f "$cid" >/dev/null 2>&1 || true + safe_timeout docker stop -t 90 "$cid" || true + safe_timeout docker 
wait "$cid" >/dev/null 2>&1 || true + safe_timeout docker rm -f "$cid" >/dev/null 2>&1 || true done sleep 2 if nvidia-smi --query-compute-apps=pid --format=csv,noheader | grep -q '[0-9]'; then @@ -98,7 +102,7 @@ jobs: fi else echo "[Docker] looking at docker resources ..." - docker ps -aq + safe_timeout docker ps -aq || echo "[WARN] docker ps timed out" fi fi sudo rm -rf /home/ubuntu/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out/openai__gpt-oss-120b From d0eb0c4e2ab4369be6c98d361c31512a95ce86f3 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 16 Nov 2025 08:02:14 -0600 Subject: [PATCH 086/214] evals mi355x-amd dsr1 3 --- .github/workflows/eval-tmpl.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 9354c4f67..c98f4be0d 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -83,10 +83,9 @@ jobs: # Helper to avoid indefinite hangs on flaky Docker daemons safe_timeout() { timeout -k 5 30s "$@"; } - if command -v docker >/dev/null 2>&1 && safe_timeout docker info >/dev/null 2>&1; then - host=$(hostname) - - if [[ "$host" == "b200-81" || "$host" == "b200-80" || "$host" == "b200-79" ]]; then + host=$(hostname) + if [[ "$host" == "b200-81" || "$host" == "b200-80" || "$host" == "b200-79" ]]; then + if command -v docker >/dev/null 2>&1; then echo "[INFO] Running container-by-container cleanup on $host" cids=$(safe_timeout docker ps -aq || true) for cid in $cids; do @@ -101,9 +100,10 @@ jobs: nvidia-smi || true fi else - echo "[Docker] looking at docker resources ..." - safe_timeout docker ps -aq || echo "[WARN] docker ps timed out" + echo "[Docker] docker client not found; skipping cleanup" fi + else + echo "[Docker] skipping docker cleanup on host $host" fi sudo rm -rf /home/ubuntu/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out/openai__gpt-oss-120b if command -v squeue >/dev/null 2>&1; then From c1dc1a6b3e66c5042a39c083825bdb03d41e76a3 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 16 Nov 2025 08:07:21 -0600 Subject: [PATCH 087/214] evals mi355x-amd dsr1 4 --- .github/workflows/eval-gms8k.yml | 6 +++--- .github/workflows/eval-tmpl.yml | 31 +++++++++++++++++++++++++------ 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 50c2390a9..860ee5987 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -43,11 +43,11 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi355x-amd_4 + runner: mi355x-amd_5 image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' }} - model: ${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }} + model: ${{ inputs.model || 'amd/DeepSeek-R1-0528-MXFP4-Preview' }} framework: sglang - precision: fp8 + precision: fp4 exp-name: dsr1_gsm8k_poc tp: '8' ep: '1' diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index c98f4be0d..2d2e8404a 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -80,8 +80,14 @@ jobs: steps: - name: Resource cleanup run: | - # Helper to avoid indefinite hangs on flaky Docker daemons - safe_timeout() { timeout -k 5 30s "$@"; } + # Helper to avoid indefinite hangs on flaky tools (Docker/Slurm) + safe_timeout() { + if command -v timeout >/dev/null 2>&1; then + timeout -k 5 30s "$@" + else + "$@" + fi + } host=$(hostname) if [[ "$host" == "b200-81" || 
"$host" == "b200-80" || "$host" == "b200-79" ]]; then @@ -105,14 +111,27 @@ jobs: else echo "[Docker] skipping docker cleanup on host $host" fi - sudo rm -rf /home/ubuntu/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out/openai__gpt-oss-120b + # Best-effort cleanup of prior eval outputs; do not block + safe_timeout sudo rm -rf /home/ubuntu/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out/openai__gpt-oss-120b || true + if command -v squeue >/dev/null 2>&1; then echo "[Slurm] Cleaning up resources ..." - scancel -u $USER || true - while [ -n "$(squeue -u $USER --noheader --format='%i')" ]; do - squeue -u $USER || true + safe_timeout scancel -u "$USER" || true + # Wait up to 5 minutes for jobs to clear to avoid indefinite hang + end=$((SECONDS + 300)) + while [ $SECONDS -lt $end ]; do + queued=$(safe_timeout squeue -u "$USER" --noheader --format='%i' 2>/dev/null || true) + if [ -z "$queued" ]; then + break + fi + echo "$queued" | sed 's/^/[Slurm] pending job: /' || true sleep 5 done + # Final status; do not block + safe_timeout squeue -u "$USER" || true + if [ -n "$(safe_timeout squeue -u "$USER" --noheader --format='%i' 2>/dev/null || true)" ]; then + echo "[Slurm] Jobs still present after timeout; proceeding" + fi fi - uses: actions/checkout@v5 From 88d3bf5730d0c71d83dafcbfdbb082416de0a887 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 16 Nov 2025 08:15:45 -0600 Subject: [PATCH 088/214] evals b200-nvd dsr1 --- .github/workflows/eval-gms8k.yml | 6 +++--- benchmarks/dsr1_fp4_b200_docker.sh | 3 +++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 860ee5987..52bdde77e 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -43,9 +43,9 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi355x-amd_5 - image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' }} - model: ${{ inputs.model || 'amd/DeepSeek-R1-0528-MXFP4-Preview' }} + runner: b200-nvd_2 + image: ${{ inputs.image || 'lmsysorg/sglang:v0.5.5-cu129-amd64' }} + model: ${{ inputs.model || 'nvidia/DeepSeek-R1-0528-FP4-V2' }} framework: sglang precision: fp4 exp-name: dsr1_gsm8k_poc diff --git a/benchmarks/dsr1_fp4_b200_docker.sh b/benchmarks/dsr1_fp4_b200_docker.sh index a520871fa..656085fef 100644 --- a/benchmarks/dsr1_fp4_b200_docker.sh +++ b/benchmarks/dsr1_fp4_b200_docker.sh @@ -48,3 +48,6 @@ run_benchmark_serving \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ +# After throughput, run evaluation (defaults to GSM8K) +run_lm_eval --port "$PORT" +append_lm_eval_summary \ No newline at end of file From 8a0677d29c6c2ffd0f5ee7ead02f5ed3b2c10dc5 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 16 Nov 2025 08:20:21 -0600 Subject: [PATCH 089/214] evals b200-nvd fp8 dsr1 --- .github/workflows/eval-gms8k.yml | 4 ++-- benchmarks/dsr1_fp8_b200_docker.sh | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 52bdde77e..eea4a707f 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -45,9 +45,9 @@ jobs: with: runner: b200-nvd_2 image: ${{ inputs.image || 'lmsysorg/sglang:v0.5.5-cu129-amd64' }} - model: ${{ inputs.model || 'nvidia/DeepSeek-R1-0528-FP4-V2' }} + model: ${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }} framework: sglang - precision: fp4 + precision: fp8 exp-name: dsr1_gsm8k_poc tp: 
'8' ep: '1' diff --git a/benchmarks/dsr1_fp8_b200_docker.sh b/benchmarks/dsr1_fp8_b200_docker.sh index ffa7644bd..e68397661 100644 --- a/benchmarks/dsr1_fp8_b200_docker.sh +++ b/benchmarks/dsr1_fp8_b200_docker.sh @@ -57,4 +57,8 @@ run_benchmark_serving \ --num-prompts "$NUM_PROMPTS" \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ No newline at end of file + --result-dir /workspace/ + +# After throughput, run evaluation (defaults to GSM8K) +run_lm_eval --port "$PORT" +append_lm_eval_summary \ No newline at end of file From f862af787c80e9725c40915bb8997292e2cf6146 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 20 Nov 2025 20:51:15 -0600 Subject: [PATCH 090/214] Lighteval 1 --- .github/workflows/eval-gms8k.yml | 10 +- benchmarks/benchmark_lib.sh | 486 ++++++++++++++++--------- benchmarks/gptoss_fp4_mi300x_docker.sh | 8 +- 3 files changed, 318 insertions(+), 186 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index eea4a707f..9ed618cda 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -43,11 +43,11 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: b200-nvd_2 - image: ${{ inputs.image || 'lmsysorg/sglang:v0.5.5-cu129-amd64' }} - model: ${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }} - framework: sglang - precision: fp8 + runner: mi300x-amd_0 + image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} + model: ${{ inputs.model || 'openai/gpt-oss-120b' }} + framework: vllm + precision: fp4 exp-name: dsr1_gsm8k_poc tp: '8' ep: '1' diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 224b4dfd5..bf96c8bda 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -1,12 +1,15 @@ #!/usr/bin/env bash -# Shared benchmarking utilities for InferenceMAX +# Shared benchmarking + evaluation utilities for InferenceMAX + +# --------------------------------- +# Server readiness / benchmarks +# --------------------------------- # Wait for server to be ready by polling the health endpoint -# All parameters are required # Parameters: -# --port: Server port -# --server-log: Path to server log file +# --port: Server port (required) +# --server-log: Path to server log file (required) # --server-pid: Server process ID (required) # --sleep-interval: Sleep interval between health checks (optional, default: 5) wait_for_server_ready() { @@ -16,73 +19,37 @@ wait_for_server_ready() { local server_pid="" local sleep_interval=5 - # Parse arguments while [[ $# -gt 0 ]]; do case $1 in - --port) - port="$2" - shift 2 - ;; - --server-log) - server_log="$2" - shift 2 - ;; - --server-pid) - server_pid="$2" - shift 2 - ;; - --sleep-interval) - sleep_interval="$2" - shift 2 - ;; - *) - echo "Unknown parameter: $1" - return 1 - ;; + --port) port="$2"; shift 2 ;; + --server-log) server_log="$2"; shift 2 ;; + --server-pid) server_pid="$2"; shift 2 ;; + --sleep-interval) sleep_interval="$2"; shift 2 ;; + *) echo "Unknown parameter: $1"; return 1 ;; esac done - # Validate required parameters - if [[ -z "$port" ]]; then - echo "Error: --port is required" - return 1 - fi - if [[ -z "$server_log" ]]; then - echo "Error: --server-log is required" - return 1 - fi - if [[ -z "$server_pid" ]]; then - echo "Error: --server-pid is required" - return 1 - fi + if [[ -z "$port" ]]; then echo "Error: --port is required"; return 1; fi + if [[ -z "$server_log" ]]; then echo "Error: --server-log is 
required"; return 1; fi + if [[ -z "$server_pid" ]]; then echo "Error: --server-pid is required"; return 1; fi # Show logs until server is ready tail -f "$server_log" & local TAIL_PID=$! - until curl --output /dev/null --silent --fail http://0.0.0.0:$port/health; do + + until curl --output /dev/null --silent --fail "http://0.0.0.0:$port/health"; do if ! kill -0 "$server_pid" 2>/dev/null; then echo "Server died before becoming healthy. Exiting." - kill $TAIL_PID + kill "$TAIL_PID" exit 1 fi sleep "$sleep_interval" done - kill $TAIL_PID + kill "$TAIL_PID" } # Run benchmark serving with standardized parameters -# All parameters are required -# Parameters: -# --model: Model name -# --port: Server port -# --backend: Backend type - 'vllm' or 'openai' -# --input-len: Random input sequence length -# --output-len: Random output sequence length -# --random-range-ratio: Random range ratio -# --num-prompts: Number of prompts -# --max-concurrency: Max concurrency -# --result-filename: Result filename without extension -# --result-dir: Result directory +# All parameters are required unless otherwise noted run_benchmark_serving() { set +x local model="" @@ -95,104 +62,43 @@ run_benchmark_serving() { local max_concurrency="" local result_filename="" local result_dir="" + local tokenizer="" - # Parse arguments while [[ $# -gt 0 ]]; do case $1 in - --model) - model="$2" - shift 2 - ;; - --port) - port="$2" - shift 2 - ;; - --backend) - backend="$2" - shift 2 - ;; - --input-len) - input_len="$2" - shift 2 - ;; - --output-len) - output_len="$2" - shift 2 - ;; - --random-range-ratio) - random_range_ratio="$2" - shift 2 - ;; - --num-prompts) - num_prompts="$2" - shift 2 - ;; - --max-concurrency) - max_concurrency="$2" - shift 2 - ;; - --result-filename) - result_filename="$2" - shift 2 - ;; - --result-dir) - result_dir="$2" - shift 2 - ;; - *) - echo "Unknown parameter: $1" - return 1 - ;; + --model) model="$2"; shift 2 ;; + --port) port="$2"; shift 2 ;; + --backend) backend="$2"; shift 2 ;; + --input-len) input_len="$2"; shift 2 ;; + --output-len) output_len="$2"; shift 2 ;; + --random-range-ratio) random_range_ratio="$2"; shift 2 ;; + --num-prompts) num_prompts="$2"; shift 2 ;; + --max-concurrency) max_concurrency="$2"; shift 2 ;; + --result-filename) result_filename="$2"; shift 2 ;; + --result-dir) result_dir="$2"; shift 2 ;; + --tokenizer) tokenizer="$2"; shift 2 ;; + *) echo "Unknown parameter: $1"; return 1 ;; esac done - # Validate all required parameters - if [[ -z "$model" ]]; then - echo "Error: --model is required" - return 1 - fi - if [[ -z "$port" ]]; then - echo "Error: --port is required" - return 1 - fi - if [[ -z "$backend" ]]; then - echo "Error: --backend is required" - return 1 - fi - if [[ -z "$input_len" ]]; then - echo "Error: --input-len is required" - return 1 - fi - if [[ -z "$output_len" ]]; then - echo "Error: --output-len is required" - return 1 - fi - if [[ -z "$random_range_ratio" ]]; then - echo "Error: --random-range-ratio is required" - return 1 - fi - if [[ -z "$num_prompts" ]]; then - echo "Error: --num-prompts is required" - return 1 - fi - if [[ -z "$max_concurrency" ]]; then - echo "Error: --max-concurrency is required" - return 1 - fi - if [[ -z "$result_filename" ]]; then - echo "Error: --result-filename is required" - return 1 - fi - if [[ -z "$result_dir" ]]; then - echo "Error: --result-dir is required" - return 1 - fi + # Validation + local vars=(model port backend input_len output_len random_range_ratio num_prompts max_concurrency result_filename result_dir) + for 
var in "${vars[@]}"; do + if [[ -z "${!var}" ]]; then + echo "Error: --${var//_/-} is required" + return 1 + fi + done - # Clone benchmark serving repo - local BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) + local BENCH_SERVING_DIR + BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) git clone https://github.com/kimbochen/bench_serving.git "$BENCH_SERVING_DIR" - # Run benchmark + local extra_tokenizer_args=() + if [[ -n "$tokenizer" ]]; then + extra_tokenizer_args=(--tokenizer "$tokenizer") + fi + set -x python3 "$BENCH_SERVING_DIR/benchmark_serving.py" \ --model "$model" \ @@ -204,6 +110,7 @@ run_benchmark_serving() { --random-range-ratio "$random_range_ratio" \ --num-prompts "$num_prompts" \ --max-concurrency "$max_concurrency" \ + "${extra_tokenizer_args[@]}" \ --request-rate inf \ --ignore-eos \ --save-result \ @@ -218,7 +125,6 @@ run_benchmark_serving() { # Eval (lm-eval-harness) helpers # ------------------------------ -# Install or update lm-eval dependencies _install_lm_eval_deps() { set +x python3 -m pip install -q --no-cache-dir "lm-eval[api]" || true @@ -239,11 +145,11 @@ from lm_eval.filters import extraction as ex def _s(x): # coerce to str return x if isinstance(x, str) else "" -# --- Patch RegexFilter.apply (used by many datasets) --- +# --- Patch RegexFilter.apply --- _orig_regex_apply = ex.RegexFilter.apply def _safe_regex_apply(self, resps, docs): out = [] - for inst in resps: # inst is a list of candidate responses for one doc + for inst in resps: filtered = [] for resp in inst: txt = _s(resp) @@ -261,7 +167,7 @@ def _safe_regex_apply(self, resps, docs): return out ex.RegexFilter.apply = _safe_regex_apply -# --- Patch MultiChoiceRegexFilter.apply (used by GSM8K flexible-extract) --- +# --- Patch MultiChoiceRegexFilter.apply --- _orig_mc_apply = ex.MultiChoiceRegexFilter.apply def _safe_mc_apply(self, resps, docs): def find_match(regex, resp, convert_dict={}): @@ -298,7 +204,6 @@ def _safe_mc_apply(self, resps, docs): out = [] for r, doc in zip(resps, docs): - # Build fallback regexes from choices (A, B, C, ...) 
as in upstream fallback_regexes, choice_to_alpha = [], {} next_alpha = "A" without_paren, without_paren_to_target = [], {} @@ -331,16 +236,6 @@ PY export PYTHONPATH="${patch_dir}:${PYTHONPATH:-}" } -# Run an lm-eval-harness task against a local OpenAI-compatible server -# Parameters: -# --port: Server port (default: $PORT or 8888) -# --task: Eval task (default: $EVAL_TASK or gsm8k) -# --num-fewshot: Fewshot k (default: $NUM_FEWSHOT or 5) -# --results-dir: Output dir (default: $EVAL_RESULT_DIR or eval_out) -# --batch-size: Harness batch size (default: 2) -# --gen-max-tokens: Max tokens for generation (default: 8192) -# --temperature: Temperature (default: 0) -# --top-p: Top-p (default: 1) run_lm_eval() { set +x local port="${PORT:-8888}" @@ -352,35 +247,25 @@ run_lm_eval() { local temperature=0 local top_p=1 - # Parse arguments while [[ $# -gt 0 ]]; do case $1 in - --port) - port="$2"; shift 2;; - --task) - task="$2"; shift 2;; - --num-fewshot) - num_fewshot="$2"; shift 2;; - --results-dir) - results_dir="$2"; shift 2;; - --batch-size) - batch_size="$2"; shift 2;; - --gen-max-tokens) - gen_max_tokens="$2"; shift 2;; - --temperature) - temperature="$2"; shift 2;; - --top-p) - top_p="$2"; shift 2;; - *) - echo "Unknown parameter: $1"; return 1;; + --port) port="$2"; shift 2 ;; + --task) task="$2"; shift 2 ;; + --num-fewshot) num_fewshot="$2"; shift 2 ;; + --results-dir) results_dir="$2"; shift 2 ;; + --batch-size) batch_size="$2"; shift 2 ;; + --gen-max-tokens) gen_max_tokens="$2"; shift 2 ;; + --temperature) temperature="$2"; shift 2 ;; + --top-p) top_p="$2"; shift 2 ;; + *) echo "Unknown parameter: $1"; return 1 ;; esac done - + _install_lm_eval_deps _patch_lm_eval_filters local openai_server_base="http://0.0.0.0:${port}" - local openai_chat_base="$openai_server_base/v1/chat/completions" + local openai_chat_base="${openai_server_base}/v1/chat/completions" export OPENAI_API_KEY=${OPENAI_API_KEY:-EMPTY} set -x @@ -394,7 +279,6 @@ run_lm_eval() { set +x } -# Append a Markdown summary to GitHub step summary (no-op if not in GH Actions) append_lm_eval_summary() { set +x local results_dir="${EVAL_RESULT_DIR:-eval_out}" @@ -411,3 +295,247 @@ append_lm_eval_summary() { >> "$GITHUB_STEP_SUMMARY" || true fi } + + +# ------------------------------ +# Lighteval + LiteLLM patching +# ------------------------------ + +_install_lighteval_deps() { + set +x + python3 -m pip install -q --no-cache-dir "lighteval[api]" "litellm" || true +} + +# Patch lighteval's LiteLLMClient to handle reasoning content and Python name mangling +_patch_lighteval_litellm() { + set +x + local patch_dir + patch_dir="$(mktemp -d)" + cat > "$patch_dir/sitecustomize.py" <<'PY' +import logging +import time + +import litellm +from tqdm import tqdm + +litellm.suppress_debug_info = True + +from lighteval.models.endpoints.litellm_model import LiteLLMClient +from lighteval.data import GenerativeTaskDataset +from lighteval.tasks.requests import Doc, SamplingMethod +from lighteval.models.model_output import ModelResponse +from lighteval.utils.cache_management import cached + +logger = logging.getLogger(__name__) + +# --- Patched __call_api: don't retry when we have reasoning_content --- +def _patched___call_api(self, prompt, return_logits, max_new_tokens, num_samples, stop_sequence): # noqa: C901 + """Make API call with retries, but don't treat reasoning-only responses as empty.""" + from lighteval.models.endpoints.litellm_model import LitellmModelResponse + + response = LitellmModelResponse() + stop_sequence = 
self._prepare_stop_sequence(stop_sequence) + max_new_tokens = self._prepare_max_new_tokens(max_new_tokens) + + if return_logits and not self.provider == "openai": + logger.warning("Returning logits is not supported for this provider, ignoring.") + + kwargs = { + "model": self.model, + "messages": prompt, + "response_format": {"type": "text"}, + "max_tokens": max_new_tokens, + "logprobs": return_logits if self.provider == "openai" else None, + "stop": stop_sequence, + "base_url": self.base_url, + "api_key": self.api_key, + "n": num_samples, + "caching": True, + "timeout": self.timeout, + } + + if "o1" in self.model: + logger.warning("O1 models do not support temperature, top_p, stop sequence. Disabling.") + else: + kwargs.update(self.generation_parameters.to_litellm_dict()) + + if kwargs.get("max_completion_tokens", None) is None: + kwargs["max_completion_tokens"] = max_new_tokens + + for attempt in range(self.API_MAX_RETRY): + try: + response = litellm.completion(**kwargs) + msg = response.choices[0].message + content = msg.content + reasoning = getattr(msg, "reasoning_content", None) + + if (not content) and reasoning: + return response + + if not content: + logger.info("Response is empty, retrying without caching") + kwargs["caching"] = False + response = litellm.completion(**kwargs) + msg = response.choices[0].message + content = msg.content + reasoning = getattr(msg, "reasoning_content", None) + + return response + except litellm.BadRequestError as e: + if "message" in e.__dict__ and "policy" in e.__dict__["message"]: + logger.warning(f"Content filtered. Returning empty response.") + return LitellmModelResponse() + except Exception as e: + wait_time = min(64, self.API_RETRY_SLEEP * (self.API_RETRY_MULTIPLIER**attempt)) + logger.warning(f"Error: {e}, waiting {wait_time}s before retry {attempt + 1}/{self.API_MAX_RETRY}") + time.sleep(wait_time) + + logger.error(f"API call failed after {self.API_MAX_RETRY} attempts.") + return LitellmModelResponse() + +# APPLY PATCH: Must use mangled name because original was private (__call_api) +LiteLLMClient._LiteLLMClient__call_api = _patched___call_api + +# --- Patched greedy_until: merge reasoning + content, preserve ordering --- +def _greedy_until_impl(self, docs: list[Doc]) -> list[ModelResponse]: + dataset = GenerativeTaskDataset(requests=docs, num_dataset_splits=self.DATASET_SPLITS) + results: list[ModelResponse] = [] + + for split in tqdm( + dataset.splits_iterator(), + total=dataset.num_dataset_splits, + desc="Splits", + position=0, + disable=self.disable_tqdm, + ): + contexts = [self.prompt_manager.prepare_prompt_api(doc) for doc in dataset] + + max_new_tokens = split[0].generation_size + return_logits = split[0].use_logits + num_samples = split[0].num_samples + stop_sequence = split[0].stop_sequences + + if num_samples > 1 and self.generation_parameters.temperature == 0: + raise ValueError("num_samples > 1 requires temperature > 0") + + # CRITICAL FIX: Access the private method via mangled name + responses = self._LiteLLMClient__call_api_parallel( + contexts, + return_logits, + max_new_tokens, + num_samples, + stop_sequence, + ) + + for response, context in zip(responses, contexts): + raw_contents = [(choice.message.content or "").strip() for choice in response.choices] + raw_reasonings = [(getattr(choice.message, "reasoning_content", None) or "").strip() for choice in response.choices] + + merged: list[str] = [] + for c, r in zip(raw_contents, raw_reasonings): + if c and r: + merged.append(r + "\n\n" + c) + elif c: + merged.append(c) + elif 
r: + merged.append(r) + else: + merged.append("") + + reasonings: list[str | None] = [r if r != "" else None for r in raw_reasonings] + + if not merged or merged[0] is None: + merged = [""] + + cur_response = ModelResponse( + text=merged, + reasonings=reasonings, + input=context, + ) + results.append(cur_response) + + if len(results) != len(dataset): + raise RuntimeError(f"Internal mismatch: {len(results)} outputs vs {len(dataset)} docs.") + + return dataset.get_original_order(results) + +# Re-apply caching decorator +LiteLLMClient.greedy_until = cached(SamplingMethod.GENERATIVE)(_greedy_until_impl) +PY + export PYTHONPATH="${patch_dir}:${PYTHONPATH:-}" +} + +run_lighteval_eval() { + set +x + local port="${PORT:-8888}" + local task="${EVAL_TASK:-gsm8k}" + local num_fewshot="${NUM_FEWSHOT:-5}" + local results_dir="${EVAL_RESULT_DIR:-eval_out_lighteval}" + local max_samples=0 + + while [[ $# -gt 0 ]]; do + case $1 in + --port) port="$2"; shift 2 ;; + --task) task="$2"; shift 2 ;; + --num-fewshot) num_fewshot="$2"; shift 2 ;; + --results-dir) results_dir="$2"; shift 2 ;; + --max-samples) max_samples="$2"; shift 2 ;; + *) echo "Unknown parameter: $1"; return 1 ;; + esac + done + + _install_lighteval_deps + _patch_lighteval_litellm + + # Prefer OPENAI_MODEL_NAME, then EVAL_MODEL_NAME, then MODEL + local model_name="${EVAL_MODEL_NAME:-${OPENAI_MODEL_NAME:-${MODEL}}}" + if [[ -z "$model_name" ]]; then + echo "Error: EVAL_MODEL_NAME / OPENAI_MODEL_NAME / MODEL not set for lighteval." >&2 + return 1 + fi + + # LiteLLM provider prefix logic + local lite_model="$model_name" + if [[ "$lite_model" != openai/* ]]; then + lite_model="openai/${lite_model}" + fi + + local base_url="http://0.0.0.0:${port}/v1" + export OPENAI_API_KEY="${OPENAI_API_KEY:-EMPTY}" + + local MODEL_ARGS="model_name=${lite_model},base_url=${base_url},api_key=${OPENAI_API_KEY}" + local TASK_SPEC="${task}|${num_fewshot}" + + set -x + lighteval endpoint litellm \ + "${MODEL_ARGS}" \ + "${TASK_SPEC}" \ + --output-dir "/workspace/${results_dir}" \ + --max-samples "${max_samples}" \ + --remove-reasoning-tags + set +x +} + + +# ------------------------------ +# Unified eval entrypoint +# ------------------------------ + +run_eval() { + set +x + local framework="${EVAL_FRAMEWORK:-lm-eval}" + local forwarded=() + + while [[ $# -gt 0 ]]; do + case "$1" in + --framework) framework="$2"; shift 2 ;; + *) forwarded+=("$1"); shift ;; + esac + done + + case "$framework" in + lm-eval|lm_eval) run_lm_eval "${forwarded[@]}" ;; + lighteval) run_lighteval_eval "${forwarded[@]}" ;; + *) echo "Unknown framework '${framework}'"; return 1 ;; + esac +} \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh index 63dcf76e1..777aa2c2d 100644 --- a/benchmarks/gptoss_fp4_mi300x_docker.sh +++ b/benchmarks/gptoss_fp4_mi300x_docker.sh @@ -25,6 +25,7 @@ export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +MODEL_NAME=${MODEL##*/} set -x vllm serve $MODEL --port $PORT \ @@ -36,6 +37,7 @@ vllm serve $MODEL --port $PORT \ --block-size=64 \ --no-enable-prefix-caching \ --disable-log-requests \ +--served-model-name $MODEL_NAME \ --async-scheduling > $SERVER_LOG 2>&1 & SERVER_PID=$! 
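# The MODEL_NAME derivation introduced above is plain bash parameter
# expansion: ${MODEL##*/} strips the longest prefix ending in "/", leaving the
# hub id's basename for --served-model-name. A minimal sketch, assuming the
# workflow's default model id:
#   MODEL=openai/gpt-oss-120b
#   MODEL_NAME=${MODEL##*/}   # -> gpt-oss-120b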
@@ -47,7 +49,8 @@ source "$(dirname "$0")/benchmark_lib.sh" wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" run_benchmark_serving \ - --model "$MODEL" \ + --model "$MODEL_NAME" \ + --tokenizer "$MODEL" \ --port "$PORT" \ --backend vllm \ --input-len "$ISL" \ @@ -59,6 +62,7 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_lm_eval --port "$PORT" +run_eval --framework lm-eval --port "$PORT" +run_eval --framework lighteval --task gsm8k --num-fewshot 5 append_lm_eval_summary set +x \ No newline at end of file From 5ef76ef40780e39c6e238541b3abd17cce3c413a Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 20 Nov 2025 21:45:28 -0600 Subject: [PATCH 091/214] Lighteval 1.75 --- .github/workflows/eval-gms8k.yml | 10 +++++++--- benchmarks/benchmark_lib.sh | 6 +++--- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 9ed618cda..cffb7277c 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -3,6 +3,11 @@ name: Eval - GSM8K (PoC) on: workflow_dispatch: inputs: + exp-name: + description: "Experiment name (prefix selects docker script)" + required: false + type: string + default: "gptoss_gsm8k_poc" image: description: "Serving image" required: false @@ -36,7 +41,7 @@ on: push: paths: - '.github/workflows/eval-gms8k.yml' - - '.github/workflows/eval-tmpl.yml' + - '.github/workflows/eval-tmpl.yml' jobs: eval: @@ -48,7 +53,7 @@ jobs: model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm precision: fp4 - exp-name: dsr1_gsm8k_poc + exp-name: ${{ inputs.exp-name || 'gptoss_gsm8k_poc' }} tp: '8' ep: '1' dp-attn: false @@ -56,4 +61,3 @@ jobs: eval-task: gsm8k num-fewshot: ${{ inputs.num_fewshot || '5' }} limit: ${{ inputs.limit || '200' }} - diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index bf96c8bda..9c33df709 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -274,7 +274,7 @@ run_lm_eval() { --num_fewshot "${num_fewshot}" \ --batch_size "${batch_size}" \ --output_path "/workspace/${results_dir}" \ - --model_args "model=${MODEL},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=3,num_concurrent=32,tokenized_requests=False" \ + --model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=3,num_concurrent=32,tokenized_requests=False" \ --gen_kwargs "max_tokens=${gen_max_tokens},temperature=${temperature},top_p=${top_p}" set +x } @@ -488,9 +488,9 @@ run_lighteval_eval() { _patch_lighteval_litellm # Prefer OPENAI_MODEL_NAME, then EVAL_MODEL_NAME, then MODEL - local model_name="${EVAL_MODEL_NAME:-${OPENAI_MODEL_NAME:-${MODEL}}}" + local model_name="${MODEL_NAME}" if [[ -z "$model_name" ]]; then - echo "Error: EVAL_MODEL_NAME / OPENAI_MODEL_NAME / MODEL not set for lighteval." >&2 + echo "Error: MODEL not set for lighteval." 
>&2 return 1 fi From 30812413d38e632fad6109647d3027d704c4fd03 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 20 Nov 2025 21:59:36 -0600 Subject: [PATCH 092/214] Lighteval Mi325x --- .github/workflows/eval-gms8k.yml | 2 +- benchmarks/gptoss_fp4_b200_docker.sh | 12 ++++++++---- benchmarks/gptoss_fp4_mi325x_slurm.sh | 13 +++++++------ 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index cffb7277c..0fca89d5b 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -48,7 +48,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi300x-amd_0 + runner: mi325x-tw_1 image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh index 8b5b6c881..92e785663 100644 --- a/benchmarks/gptoss_fp4_b200_docker.sh +++ b/benchmarks/gptoss_fp4_b200_docker.sh @@ -45,11 +45,12 @@ export PYTHONNOUSERSITE=1 export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +MODEL_NAME=${MODEL##*/} set -x vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ --gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs 512 \ ---disable-log-requests > $SERVER_LOG 2>&1 & +--disable-log-requests --served-model-name $MODEL_NAME > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -62,7 +63,8 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas run_benchmark_serving \ - --model "$MODEL" \ + --model "$MODEL_NAME" \ + --tokenizer "$MODEL" \ --port "$PORT" \ --backend vllm \ --input-len "$ISL" \ @@ -74,5 +76,7 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_lm_eval --port "$PORT" -append_lm_eval_summary \ No newline at end of file +run_eval --framework lm-eval --port "$PORT" +run_eval --framework lighteval --task gsm8k --num-fewshot 5 +append_lm_eval_summary +set +x \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh index 4219d0662..3394bcc04 100644 --- a/benchmarks/gptoss_fp4_mi325x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh @@ -33,9 +33,8 @@ fi export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 +MODEL_NAME=${MODEL##*/} -# -## Start up vllm server set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ @@ -46,8 +45,8 @@ vllm serve $MODEL --port $PORT \ --block-size=64 \ --no-enable-prefix-caching \ --disable-log-requests \ ---async-scheduling \ -> $SERVER_LOG 2>&1 & +--served-model-name $MODEL_NAME \ +--async-scheduling > $SERVER_LOG 2>&1 & SERVER_PID=$! 
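# wait_for_server_ready (benchmark_lib.sh) gates the client on the same probe
# a manual check would use: curl --silent --fail against /health, retried
# every 5 seconds by default. A sketch of the equivalent loop, assuming the
# default port 8888:
#   until curl -sf http://0.0.0.0:8888/health; do sleep 5; done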
@@ -58,7 +57,8 @@ source "$(dirname "$0")/benchmark_lib.sh" wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" run_benchmark_serving \ - --model "$MODEL" \ + --model "$MODEL_NAME" \ + --tokenizer "$MODEL" \ --port "$PORT" \ --backend vllm \ --input-len "$ISL" \ @@ -70,6 +70,7 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_lm_eval --port "$PORT" +run_eval --framework lm-eval --port "$PORT" +run_eval --framework lighteval --task gsm8k --num-fewshot 5 append_lm_eval_summary set +x \ No newline at end of file From f182319a122bb2482dc0f31bfbc6c2aed516b493 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 21 Nov 2025 07:19:13 -0600 Subject: [PATCH 093/214] Lighteval Mi300x CR --- .github/workflows/eval-gms8k.yml | 2 +- benchmarks/benchmark_lib.sh | 32 ++++++++++++++++++++++---------- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 0fca89d5b..fc284160c 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -48,7 +48,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi325x-tw_1 + runner: mi300x-cr_0 image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 9c33df709..efddc3230 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -283,16 +283,28 @@ append_lm_eval_summary() { set +x local results_dir="${EVAL_RESULT_DIR:-eval_out}" local task="${EVAL_TASK:-gsm8k}" + # Always render a local summary so the runner can pick it up + local out_dir="/workspace/${results_dir}" + local summary_md="${out_dir}/SUMMARY.md" + mkdir -p "$out_dir" || true + + python3 utils/lm_eval_to_md.py \ + --results-dir "$out_dir" \ + --task "${task}" \ + --framework "${FRAMEWORK}" \ + --precision "${PRECISION}" \ + --tp "${TP:-1}" \ + --ep "${EP_SIZE:-1}" \ + --dp-attention "${DP_ATTENTION:-false}" \ + > "$summary_md" || true + + # If running inside a GitHub Actions step on this same machine, append there too if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then - python3 utils/lm_eval_to_md.py \ - --results-dir "/workspace/${results_dir}" \ - --task "${task}" \ - --framework "${FRAMEWORK}" \ - --precision "${PRECISION}" \ - --tp "${TP:-1}" \ - --ep "${EP_SIZE:-1}" \ - --dp-attention "${DP_ATTENTION:-false}" \ - >> "$GITHUB_STEP_SUMMARY" || true + local GH_SUM_DIR + GH_SUM_DIR="$(dirname "$GITHUB_STEP_SUMMARY")" + if [ -d "$GH_SUM_DIR" ] && [ -w "$GH_SUM_DIR" ]; then + cat "$summary_md" >> "$GITHUB_STEP_SUMMARY" || true + fi fi } @@ -538,4 +550,4 @@ run_eval() { lighteval) run_lighteval_eval "${forwarded[@]}" ;; *) echo "Unknown framework '${framework}'"; return 1 ;; esac -} \ No newline at end of file +} From 5ba2cf2691cc4af5832ddf6c9fcc8a513e209d0e Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 21 Nov 2025 07:51:23 -0600 Subject: [PATCH 094/214] Lighteval Mi355x amd --- .github/workflows/eval-gms8k.yml | 2 +- benchmarks/gptoss_fp4_mi355x_docker.sh | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index fc284160c..1ef8ea3bf 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -48,7 +48,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml 
secrets: inherit with: - runner: mi300x-cr_0 + runner: mi355x-amd_4 image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh index f63cc9960..a413acd69 100644 --- a/benchmarks/gptoss_fp4_mi355x_docker.sh +++ b/benchmarks/gptoss_fp4_mi355x_docker.sh @@ -23,6 +23,7 @@ export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_FUSED_MOE_A16W4=1 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +MODEL_NAME=${MODEL##*/} set -x vllm serve $MODEL --port $PORT \ @@ -34,6 +35,7 @@ vllm serve $MODEL --port $PORT \ --block-size=64 \ --no-enable-prefix-caching \ --disable-log-requests \ +--served-model-name $MODEL_NAME \ --async-scheduling > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -45,7 +47,8 @@ source "$(dirname "$0")/benchmark_lib.sh" wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" run_benchmark_serving \ - --model "$MODEL" \ + --model "$MODEL_NAME" \ + --tokenizer "$MODEL" \ --port "$PORT" \ --backend vllm \ --input-len "$ISL" \ @@ -57,6 +60,7 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_lm_eval --port "$PORT" +run_eval --framework lm-eval --port "$PORT" +run_eval --framework lighteval --task gsm8k --num-fewshot 5 append_lm_eval_summary set +x \ No newline at end of file From 5bf69ab6d9678c98e10c011abe579f79e4ee8fab Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 21 Nov 2025 07:54:46 -0600 Subject: [PATCH 095/214] Lighteval b200_nvd --- .github/workflows/eval-gms8k.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 1ef8ea3bf..60551abf5 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -48,8 +48,8 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi355x-amd_4 - image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} + runner: b200-nvd_0 + image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm precision: fp4 From f862689b444999f4f6af87d15ac1239a748db263 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 21 Nov 2025 08:00:44 -0600 Subject: [PATCH 096/214] Lighteval h200_cr0 --- .github/workflows/eval-gms8k.yml | 2 +- benchmarks/gptoss_fp4_b200_docker.sh | 10 +++++++--- benchmarks/gptoss_fp4_h100_docker.sh | 14 +++++++++----- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 60551abf5..ee15335e2 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -48,7 +48,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: b200-nvd_0 + runner: h100-cr_0 image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh index 92e785663..1f4679383 100644 --- a/benchmarks/gptoss_fp4_b200_docker.sh +++ b/benchmarks/gptoss_fp4_b200_docker.sh @@ -48,9 +48,13 @@ SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) MODEL_NAME=${MODEL##*/} set -x -vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ ---gpu-memory-utilization 0.9 
--tensor-parallel-size $TP --max-num-seqs 512 \ ---disable-log-requests --served-model-name $MODEL_NAME > $SERVER_LOG 2>&1 & +vllm serve $MODEL --host 0.0.0.0 --port $PORT \ +--config config.yaml \ +--gpu-memory-utilization 0.9 \ +--tensor-parallel-size $TP \ +--max-num-seqs 512 \ +--disable-log-requests \ +--served-model-name $MODEL_NAME > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 212059b04..e9c1cfc5a 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -26,15 +26,16 @@ EOF export PYTHONNOUSERSITE=1 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +MODEL_NAME=${MODEL##*/} -# Start server in the background, shld be openai/gpt-oss-120b set -x vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --config config.yaml \ --gpu-memory-utilization=0.9 \ --tensor-parallel-size=$TP \ --max-num-seqs=$CONC \ ---disable-log-requests > $SERVER_LOG 2>&1 & +--disable-log-requests \ +--served-model-name $MODEL_NAME > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -47,7 +48,8 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas run_benchmark_serving \ - --model "$MODEL" \ + --model "$MODEL_NAME" \ + --tokenizer "$MODEL" \ --port "$PORT" \ --backend vllm \ --input-len "$ISL" \ @@ -59,5 +61,7 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_lm_eval --port "$PORT" -append_lm_eval_summary \ No newline at end of file +run_eval --framework lm-eval --port "$PORT" +run_eval --framework lighteval --task gsm8k --num-fewshot 5 +append_lm_eval_summary +set +x \ No newline at end of file From c3df519fea9019208f1c04f4a8bfba4851c3c949 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 21 Nov 2025 08:03:45 -0600 Subject: [PATCH 097/214] Lighteval h200-nb_1 --- .github/workflows/eval-gms8k.yml | 2 +- benchmarks/gptoss_fp4_h200_slurm.sh | 20 ++++++++++++++------ 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index ee15335e2..8cc9a6e42 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -48,7 +48,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: h100-cr_0 + runner: h200-nb_1 image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh index 2c18d4d6a..9906b2fa5 100644 --- a/benchmarks/gptoss_fp4_h200_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_slurm.sh @@ -41,12 +41,17 @@ EOF SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) export TORCH_CUDA_ARCH_LIST="9.0" PORT=$(( 8888 + $PORT_OFFSET )) +MODEL_NAME=${MODEL##*/} export TORCH_CUDA_ARCH_LIST="9.0" -PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ - --gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \ - --disable-log-requests > $SERVER_LOG 2>&1 & +PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT \ + --config config.yaml \ + --gpu-memory-utilization 0.9 \ + --tensor-parallel-size $TP \ + --max-num-seqs $CONC \ + --disable-log-requests \ + --served-model-name $MODEL_NAME > $SERVER_LOG 2>&1 & SERVER_PID=$! 
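# The listen port above is offset from the 8888 base, presumably so jobs
# sharing a Slurm node don't collide. A sketch with a hypothetical offset:
#   PORT_OFFSET=2
#   PORT=$(( 8888 + PORT_OFFSET ))   # -> 8890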
@@ -57,7 +62,8 @@ source "$(dirname "$0")/benchmark_lib.sh" wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" run_benchmark_serving \ - --model "$MODEL" \ + --model "$MODEL_NAME" \ + --tokenizer "$MODEL" \ --port "$PORT" \ --backend vllm \ --input-len "$ISL" \ @@ -69,5 +75,7 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_lm_eval --port "$PORT" -append_lm_eval_summary \ No newline at end of file +run_eval --framework lm-eval --port "$PORT" +run_eval --framework lighteval --task gsm8k --num-fewshot 5 +append_lm_eval_summary +set +x \ No newline at end of file From c1edb9ac6c6a742fe61653a599c6fabc65ac8ece Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 21 Nov 2025 08:06:32 -0600 Subject: [PATCH 098/214] Lighteval h100-cw_1 --- .github/workflows/eval-gms8k.yml | 2 +- benchmarks/gptoss_fp4_h100_slurm.sh | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 8cc9a6e42..e980da0aa 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -48,7 +48,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: h200-nb_1 + runner: h100-cw_1 image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh index b463a8aaf..394d68bc1 100644 --- a/benchmarks/gptoss_fp4_h100_slurm.sh +++ b/benchmarks/gptoss_fp4_h100_slurm.sh @@ -27,6 +27,7 @@ EOF SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) export TORCH_CUDA_ARCH_LIST="9.0" PORT=${PORT:-8888} +MODEL_NAME=${MODEL##*/} set -x PYTHONNOUSERSITE=1 vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ @@ -35,7 +36,7 @@ PYTHONNOUSERSITE=1 vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --tensor-parallel-size=$TP \ --max-num-seqs=$CONC \ --disable-log-requests \ - > $SERVER_LOG 2>&1 & + --served-model-name $MODEL_NAME > $SERVER_LOG 2>&1 & SERVER_PID=$! 
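# For reference, run_eval --framework lighteval --task gsm8k --num-fewshot 5
# expands via run_lighteval_eval (benchmark_lib.sh) to roughly the following,
# assuming MODEL_NAME=gpt-oss-120b and the default port, key, and output dir:
#   lighteval endpoint litellm \
#     "model_name=openai/gpt-oss-120b,base_url=http://0.0.0.0:8888/v1,api_key=EMPTY" \
#     "gsm8k|5" \
#     --output-dir /workspace/eval_out_lighteval \
#     --max-samples 0 \
#     --remove-reasoning-tags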
@@ -48,7 +49,8 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas run_benchmark_serving \ - --model "$MODEL" \ + --model "$MODEL_NAME" \ + --tokenizer "$MODEL" \ --port "$PORT" \ --backend vllm \ --input-len "$ISL" \ @@ -60,5 +62,7 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_lm_eval --port "$PORT" +run_eval --framework lm-eval --port "$PORT" +run_eval --framework lighteval --task gsm8k --num-fewshot 5 append_lm_eval_summary +set +x From d21826b355a58a5b34ecf1dfe59f2b196be42ea0 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 21 Nov 2025 22:41:53 -0600 Subject: [PATCH 099/214] Error reproduction --- .github/workflows/eval-gms8k.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index e980da0aa..dfa643398 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -47,8 +47,8 @@ jobs: eval: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit - with: - runner: h100-cw_1 + with: + runner: h200-nb_1 image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm From abdad78c7463e38610a387221ca5a0ac11c5a2ea Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 21 Nov 2025 23:02:36 -0600 Subject: [PATCH 100/214] Error file removal --- benchmarks/benchmark_lib.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index efddc3230..e9c2f20d1 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -538,6 +538,10 @@ run_eval() { local framework="${EVAL_FRAMEWORK:-lm-eval}" local forwarded=() + # Defensive cleanup: remove any LiteLLM cache in the repo workspace so + # subsequent steps (e.g., actions/checkout) won't hit permission issues. + rm -rf .litellm_cache 2>/dev/null || true + while [[ $# -gt 0 ]]; do case "$1" in --framework) framework="$2"; shift 2 ;; @@ -550,4 +554,7 @@ run_eval() { lighteval) run_lighteval_eval "${forwarded[@]}" ;; *) echo "Unknown framework '${framework}'"; return 1 ;; esac + + # Clean up again after eval, in case the tool recreated it. 
+ rm -rf .litellm_cache 2>/dev/null || true } From bd3653035bd9f0da0f1f81246dee5a2fbb37d8e1 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 22 Nov 2025 15:52:55 -0600 Subject: [PATCH 101/214] error reproducibility --- .github/workflows/eval-gms8k.yml | 6 +++--- benchmarks/benchmark_lib.sh | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index dfa643398..cffb7277c 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -47,9 +47,9 @@ jobs: eval: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit - with: - runner: h200-nb_1 - image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} + with: + runner: mi300x-amd_0 + image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm precision: fp4 diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index e9c2f20d1..52c972fc4 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -540,7 +540,7 @@ run_eval() { # Defensive cleanup: remove any LiteLLM cache in the repo workspace so # subsequent steps (e.g., actions/checkout) won't hit permission issues. - rm -rf .litellm_cache 2>/dev/null || true + #rm -rf .litellm_cache 2>/dev/null || true while [[ $# -gt 0 ]]; do case "$1" in @@ -556,5 +556,5 @@ run_eval() { esac # Clean up again after eval, in case the tool recreated it. - rm -rf .litellm_cache 2>/dev/null || true + #rm -rf .litellm_cache 2>/dev/null || true } From a0434b1ea5af218d9115281cd2cb483909d9afb8 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 22 Nov 2025 16:26:29 -0600 Subject: [PATCH 102/214] should NOT error reproduce --- .github/workflows/eval-tmpl.yml | 3 ++- benchmarks/benchmark_lib.sh | 8 ++------ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 2d2e8404a..2d21820c6 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -113,6 +113,7 @@ jobs: fi # Best-effort cleanup of prior eval outputs; do not block safe_timeout sudo rm -rf /home/ubuntu/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out/openai__gpt-oss-120b || true + safe_timeout sudo rm -rf /home/ubuntu/actions-runner/_work/InferenceMAX/InferenceMAX/.litellm_cache/cache.db || true if command -v squeue >/dev/null 2>&1; then echo "[Slurm] Cleaning up resources ..." @@ -138,7 +139,7 @@ jobs: with: fetch-depth: 0 # Avoid aggressive workspace deletion if stale, rely on git reset/clean later - clean: false + clean: true - name: Launch eval via runner script env: diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 52c972fc4..c91e95707 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -515,7 +515,7 @@ run_lighteval_eval() { local base_url="http://0.0.0.0:${port}/v1" export OPENAI_API_KEY="${OPENAI_API_KEY:-EMPTY}" - local MODEL_ARGS="model_name=${lite_model},base_url=${base_url},api_key=${OPENAI_API_KEY}" + local MODEL_ARGS="model_name=${lite_model},base_url=${base_url},api_key=${OPENAI_API_KEY},generation_parameters={temperature:1.0}" local TASK_SPEC="${task}|${num_fewshot}" set -x @@ -538,10 +538,6 @@ run_eval() { local framework="${EVAL_FRAMEWORK:-lm-eval}" local forwarded=() - # Defensive cleanup: remove any LiteLLM cache in the repo workspace so - # subsequent steps (e.g., actions/checkout) won't hit permission issues. 
- #rm -rf .litellm_cache 2>/dev/null || true - while [[ $# -gt 0 ]]; do case "$1" in --framework) framework="$2"; shift 2 ;; @@ -556,5 +552,5 @@ run_eval() { esac # Clean up again after eval, in case the tool recreated it. - #rm -rf .litellm_cache 2>/dev/null || true + rm -rf .litellm_cache 2>/dev/null || true } From f56a3117da9db7aa2367f1366a19dab39a717a86 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 22 Nov 2025 16:31:02 -0600 Subject: [PATCH 103/214] should NOT error reproduce --- .github/workflows/eval-tmpl.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 2d21820c6..cbfd04faf 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -80,6 +80,7 @@ jobs: steps: - name: Resource cleanup run: | + sudo rm -rf /home/kimbosemianalysis/actions-runner/_work/InferenceMAX/InferenceMAX/.litellm_cache/ # Helper to avoid indefinite hangs on flaky tools (Docker/Slurm) safe_timeout() { if command -v timeout >/dev/null 2>&1; then From 27bd2de9691dab1c253c77f0ae7701dbc4a517fa Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 22 Nov 2025 16:33:34 -0600 Subject: [PATCH 104/214] should NOT error reproduce --- .github/workflows/eval-tmpl.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index cbfd04faf..bb9ec6fe0 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -81,6 +81,7 @@ jobs: - name: Resource cleanup run: | sudo rm -rf /home/kimbosemianalysis/actions-runner/_work/InferenceMAX/InferenceMAX/.litellm_cache/ + sudo rm -rf /home/kimbosemianalysis/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out/ # Helper to avoid indefinite hangs on flaky tools (Docker/Slurm) safe_timeout() { if command -v timeout >/dev/null 2>&1; then From c058b1663c4bf430fbf194846a7dce4043a51c6f Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 22 Nov 2025 16:36:41 -0600 Subject: [PATCH 105/214] should NOT error reproduce --- .github/workflows/eval-tmpl.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index bb9ec6fe0..1930f0d2c 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -82,6 +82,7 @@ jobs: run: | sudo rm -rf /home/kimbosemianalysis/actions-runner/_work/InferenceMAX/InferenceMAX/.litellm_cache/ sudo rm -rf /home/kimbosemianalysis/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out/ + sudo rm -rf /home/kimbosemianalysis/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out_lighteval # Helper to avoid indefinite hangs on flaky tools (Docker/Slurm) safe_timeout() { if command -v timeout >/dev/null 2>&1; then From 2e3691449a993c0e19d7a27146be18335b49ba46 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 23 Nov 2025 18:17:42 +0800 Subject: [PATCH 106/214] Double check other runner --- .github/workflows/eval-gms8k.yml | 4 ++-- benchmarks/benchmark_lib.sh | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index cffb7277c..8cc9a6e42 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -48,8 +48,8 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi300x-amd_0 - image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} + runner: h200-nb_1 + image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ 
inputs.model || 'openai/gpt-oss-120b' }} framework: vllm precision: fp4 diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index c91e95707..8692ec40d 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -551,6 +551,4 @@ run_eval() { *) echo "Unknown framework '${framework}'"; return 1 ;; esac - # Clean up again after eval, in case the tool recreated it. - rm -rf .litellm_cache 2>/dev/null || true } From d2cf0fbbeb12773db6edc19769ed2c28f301e0ce Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 23 Nov 2025 18:21:35 +0800 Subject: [PATCH 107/214] Cleanup MI300x_AMD --- .github/workflows/eval-gms8k.yml | 4 +- benchmarks/benchmark_lib.sh | 51 ++++++++++++++++++++++++++ benchmarks/gptoss_fp4_mi300x_docker.sh | 3 ++ 3 files changed, 56 insertions(+), 2 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 8cc9a6e42..cffb7277c 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -48,8 +48,8 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: h200-nb_1 - image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} + runner: mi300x-amd_0 + image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm precision: fp4 diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 8692ec40d..a52e7cc17 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -552,3 +552,54 @@ run_eval() { esac } + +# ...existing code... + +# ------------------------------ +# Cleanup utilities +# ------------------------------ + +# Clean up evaluation and cache artifacts +# This function should be called at the end of benchmark/eval scripts +cleanup_eval_artifacts() { + set +x + echo "[Cleanup] Removing evaluation artifacts and cache directories..." 
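+    # (Note: the /workspace paths below assume the benchmark container mounts the
+    # repo checkout at /workspace; each removal is "|| true" best-effort, so this
+    # cleanup can never itself fail a benchmark run.)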
+ + # Clean up litellm cache + if [ -d "/workspace/.litellm_cache" ]; then + rm -rf /workspace/.litellm_cache || true + echo "[Cleanup] Removed .litellm_cache" + fi + + # Clean up eval output directories + for dir in /workspace/eval_out* /workspace/.cache; do + if [ -d "$dir" ]; then + rm -rf "$dir" || true + echo "[Cleanup] Removed $dir" + fi + done + + # Clean up temporary benchmark directories + if [ -n "${BENCH_SERVING_DIR:-}" ] && [ -d "$BENCH_SERVING_DIR" ]; then + rm -rf "$BENCH_SERVING_DIR" || true + echo "[Cleanup] Removed benchmark serving temp dir" + fi + + # Clean up Python cache + find /workspace -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true + find /workspace -type f -name "*.pyc" -delete 2>/dev/null || true + + # Fix permissions for any remaining files (in case cleanup is run without sudo) + chmod -R 777 /workspace/.litellm_cache 2>/dev/null || true + chmod -R 777 /workspace/eval_out* 2>/dev/null || true + + echo "[Cleanup] Artifact cleanup complete" + set -x +} + +# Trap to ensure cleanup runs even if script fails +# Call this at the start of your benchmark scripts: +# trap cleanup_eval_artifacts EXIT +setup_cleanup_trap() { + trap cleanup_eval_artifacts EXIT INT TERM +} diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh index 777aa2c2d..31ac5ba56 100644 --- a/benchmarks/gptoss_fp4_mi300x_docker.sh +++ b/benchmarks/gptoss_fp4_mi300x_docker.sh @@ -61,6 +61,9 @@ run_benchmark_serving \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ +# Auto cleanup on exit +setup_cleanup_trap + # After throughput, run evaluation (defaults to GSM8K) run_eval --framework lm-eval --port "$PORT" run_eval --framework lighteval --task gsm8k --num-fewshot 5 From 0a8901ab64e3d4f916189c5a927683aca0f90a17 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 23 Nov 2025 18:37:43 +0800 Subject: [PATCH 108/214] Cleanup MI300x_AMD --- .github/workflows/eval-tmpl.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 1930f0d2c..2d21820c6 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -80,9 +80,6 @@ jobs: steps: - name: Resource cleanup run: | - sudo rm -rf /home/kimbosemianalysis/actions-runner/_work/InferenceMAX/InferenceMAX/.litellm_cache/ - sudo rm -rf /home/kimbosemianalysis/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out/ - sudo rm -rf /home/kimbosemianalysis/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out_lighteval # Helper to avoid indefinite hangs on flaky tools (Docker/Slurm) safe_timeout() { if command -v timeout >/dev/null 2>&1; then From afd304fab4c7c5658700c8c014d68c16c4d0cfd2 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 23 Nov 2025 18:43:05 +0800 Subject: [PATCH 109/214] Cleanup MI300x_AMD --- .github/workflows/eval-tmpl.yml | 8 ++++ benchmarks/benchmark_lib.sh | 54 +------------------------- benchmarks/gptoss_fp4_mi300x_docker.sh | 3 -- 3 files changed, 10 insertions(+), 55 deletions(-) diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 2d21820c6..195d0a158 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -159,3 +159,11 @@ jobs: ${{ env.EVAL_RESULT_DIR }}/ ${{ env.EVAL_RESULT_DIR }}/* ${{ env.EVAL_RESULT_DIR }}/** + + - name: Resource cleanup + run: | + safe_timeout sudo rm -rf \ + "${{ github.workspace }}/.litellm_cache" \ + "${{ github.workspace }}/eval_out"* \ + "${{ github.workspace }}/.cache" \ + || true 
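
[Note between the two file diffs: the underlying problem all of these cleanup steps
work around is file ownership. The eval containers run as root, so anything they write
under the bind-mounted workspace (.litellm_cache, eval_out*) ends up root-owned, and a
later actions/checkout running as the unprivileged runner user cannot delete it; hence
the sudo rm -rf at the workflow level. A minimal reproduction of the symptom, assuming
Docker is available and the current user is not root:

    docker run --rm -v "$PWD:/workspace" -w /workspace ubuntu:22.04 \
      bash -c 'mkdir -p eval_out && touch eval_out/cache.db'
    ls -ld eval_out    # owned by root:root
    rm -rf eval_out    # "Permission denied" without sudo
]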
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index a52e7cc17..aaa9dbf88 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -523,6 +523,7 @@ run_lighteval_eval() { "${MODEL_ARGS}" \ "${TASK_SPEC}" \ --output-dir "/workspace/${results_dir}" \ + --use-chat-template \ --max-samples "${max_samples}" \ --remove-reasoning-tags set +x @@ -551,55 +552,4 @@ run_eval() { *) echo "Unknown framework '${framework}'"; return 1 ;; esac -} - -# ...existing code... - -# ------------------------------ -# Cleanup utilities -# ------------------------------ - -# Clean up evaluation and cache artifacts -# This function should be called at the end of benchmark/eval scripts -cleanup_eval_artifacts() { - set +x - echo "[Cleanup] Removing evaluation artifacts and cache directories..." - - # Clean up litellm cache - if [ -d "/workspace/.litellm_cache" ]; then - rm -rf /workspace/.litellm_cache || true - echo "[Cleanup] Removed .litellm_cache" - fi - - # Clean up eval output directories - for dir in /workspace/eval_out* /workspace/.cache; do - if [ -d "$dir" ]; then - rm -rf "$dir" || true - echo "[Cleanup] Removed $dir" - fi - done - - # Clean up temporary benchmark directories - if [ -n "${BENCH_SERVING_DIR:-}" ] && [ -d "$BENCH_SERVING_DIR" ]; then - rm -rf "$BENCH_SERVING_DIR" || true - echo "[Cleanup] Removed benchmark serving temp dir" - fi - - # Clean up Python cache - find /workspace -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true - find /workspace -type f -name "*.pyc" -delete 2>/dev/null || true - - # Fix permissions for any remaining files (in case cleanup is run without sudo) - chmod -R 777 /workspace/.litellm_cache 2>/dev/null || true - chmod -R 777 /workspace/eval_out* 2>/dev/null || true - - echo "[Cleanup] Artifact cleanup complete" - set -x -} - -# Trap to ensure cleanup runs even if script fails -# Call this at the start of your benchmark scripts: -# trap cleanup_eval_artifacts EXIT -setup_cleanup_trap() { - trap cleanup_eval_artifacts EXIT INT TERM -} +} \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh index 31ac5ba56..777aa2c2d 100644 --- a/benchmarks/gptoss_fp4_mi300x_docker.sh +++ b/benchmarks/gptoss_fp4_mi300x_docker.sh @@ -61,9 +61,6 @@ run_benchmark_serving \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ -# Auto cleanup on exit -setup_cleanup_trap - # After throughput, run evaluation (defaults to GSM8K) run_eval --framework lm-eval --port "$PORT" run_eval --framework lighteval --task gsm8k --num-fewshot 5 From ef2ee409eec0576403aeaac19212d25fc13712b2 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 23 Nov 2025 19:09:33 +0800 Subject: [PATCH 110/214] Cleanup MI300x_AMD MUST WORK --- .github/workflows/eval-tmpl.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 195d0a158..3b8885124 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -80,6 +80,9 @@ jobs: steps: - name: Resource cleanup run: | + sudo rm -rf /home/kimbosemianalysis/actions-runner/_work/InferenceMAX/InferenceMAX/.litellm_cache/ + sudo rm -rf /home/kimbosemianalysis/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out/ + sudo rm -rf /home/kimbosemianalysis/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out_lighteval # Helper to avoid indefinite hangs on flaky tools (Docker/Slurm) safe_timeout() { if command -v timeout >/dev/null 
2>&1; then @@ -159,11 +162,11 @@ jobs: ${{ env.EVAL_RESULT_DIR }}/ ${{ env.EVAL_RESULT_DIR }}/* ${{ env.EVAL_RESULT_DIR }}/** - + - name: Resource cleanup run: | - safe_timeout sudo rm -rf \ + sudo rm -rf \ "${{ github.workspace }}/.litellm_cache" \ "${{ github.workspace }}/eval_out"* \ "${{ github.workspace }}/.cache" \ - || true + || true \ No newline at end of file From 379069623766a95963fdff2e4acb554fc489f2a4 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 23 Nov 2025 19:18:58 +0800 Subject: [PATCH 111/214] works --- .github/workflows/eval-tmpl.yml | 5 +---- benchmarks/benchmark_lib.sh | 1 - 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 3b8885124..27f9b8b3f 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -80,9 +80,6 @@ jobs: steps: - name: Resource cleanup run: | - sudo rm -rf /home/kimbosemianalysis/actions-runner/_work/InferenceMAX/InferenceMAX/.litellm_cache/ - sudo rm -rf /home/kimbosemianalysis/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out/ - sudo rm -rf /home/kimbosemianalysis/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out_lighteval # Helper to avoid indefinite hangs on flaky tools (Docker/Slurm) safe_timeout() { if command -v timeout >/dev/null 2>&1; then @@ -162,7 +159,7 @@ jobs: ${{ env.EVAL_RESULT_DIR }}/ ${{ env.EVAL_RESULT_DIR }}/* ${{ env.EVAL_RESULT_DIR }}/** - + - name: Resource cleanup run: | sudo rm -rf \ diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index aaa9dbf88..c0b8bbb0b 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -523,7 +523,6 @@ run_lighteval_eval() { "${MODEL_ARGS}" \ "${TASK_SPEC}" \ --output-dir "/workspace/${results_dir}" \ - --use-chat-template \ --max-samples "${max_samples}" \ --remove-reasoning-tags set +x From 92f244cb7b45fa8631debd3e64194d6916c24da1 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 25 Nov 2025 13:17:07 +0800 Subject: [PATCH 112/214] Working lighteval --- .github/workflows/eval-gms8k.yml | 8 ++-- .github/workflows/eval-tmpl.yml | 7 ++-- benchmarks/benchmark_lib.sh | 63 +++++++++++++++++--------------- 3 files changed, 40 insertions(+), 38 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index cffb7277c..34fe89d3c 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -22,7 +22,7 @@ on: description: "Tensor Parallel Size" required: false type: string - default: "8" + default: "2" port: description: "Server port" required: false @@ -48,13 +48,13 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi300x-amd_0 - image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} + runner: h200-nb_1 + image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm precision: fp4 exp-name: ${{ inputs.exp-name || 'gptoss_gsm8k_poc' }} - tp: '8' + tp: '2' ep: '1' dp-attn: false port: ${{ inputs.port || '8888' }} diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 27f9b8b3f..63e568dd2 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -64,8 +64,7 @@ env: NUM_FEWSHOT: ${{ inputs['num-fewshot'] }} LIMIT: ${{ inputs.limit }} EVAL_RESULT_DIR: eval_out - # Server-side concurrency default (used by some server scripts) - CONC: '8' + CONC: '4' MAX_MODEL_LEN: '8192' ISL: 1024 OSL: 
1024
@@ -111,8 +111,8 @@ jobs:
           echo "[Docker] skipping docker cleanup on host $host"
         fi
         # Best-effort cleanup of prior eval outputs; do not block
-        safe_timeout sudo rm -rf /home/ubuntu/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out/openai__gpt-oss-120b || true
-        safe_timeout sudo rm -rf /home/ubuntu/actions-runner/_work/InferenceMAX/InferenceMAX/.litellm_cache/cache.db || true
+        sudo rm -rf /home/ubuntu/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out/openai__gpt-oss-120b || true
+        sudo rm -rf /home/ubuntu/actions-runner/_work/InferenceMAX/InferenceMAX/.litellm_cache/cache.db || true
 
         if command -v squeue >/dev/null 2>&1; then
           echo "[Slurm] Cleaning up resources ..."
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index c0b8bbb0b..6b9ac8f65 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -319,6 +319,8 @@ _install_lighteval_deps() {
 }
 
 # Patch lighteval's LiteLLMClient to handle reasoning content and Python name mangling
+# 1. Removed "response_format": {"type": "text"}, as it interfered with the vLLM endpoint
+# 2. Concatenate reasoning with the output tokens, since the output is sometimes empty.
 _patch_lighteval_litellm() {
   set +x
   local patch_dir
@@ -340,13 +342,12 @@ from lighteval.utils.cache_management import cached
 
 logger = logging.getLogger(__name__)
 
-# --- Patched __call_api: don't retry when we have reasoning_content ---
+# --- Patched __call_api: don't retry when we have reasoning_content, enforce chat template on vLLM and avoid stop interference ---
 def _patched___call_api(self, prompt, return_logits, max_new_tokens, num_samples, stop_sequence):  # noqa: C901
-    """Make API call with retries, but don't treat reasoning-only responses as empty."""
     from lighteval.models.endpoints.litellm_model import LitellmModelResponse
-    response = LitellmModelResponse()
-    stop_sequence = self._prepare_stop_sequence(stop_sequence)
+
+    stop_sequence = None  # Important: let the chat template drive turn-taking
     max_new_tokens = self._prepare_max_new_tokens(max_new_tokens)
 
     if return_logits and not self.provider == "openai":
@@ -355,15 +356,18 @@ def _patched___call_api(self, prompt, return_logits, max_new_tokens, num_samples
     kwargs = {
         "model": self.model,
         "messages": prompt,
-        "response_format": {"type": "text"},
         "max_tokens": max_new_tokens,
         "logprobs": return_logits if self.provider == "openai" else None,
-        "stop": stop_sequence,
+        "stop": stop_sequence,  # disabled for chat
         "base_url": self.base_url,
         "api_key": self.api_key,
         "n": num_samples,
         "caching": True,
         "timeout": self.timeout,
+        # vLLM OpenAI server: ensure chat template is applied and an assistant turn is started
+        "extra_body": {
+            "use_chat_template": True
+        },
     }
 
     if "o1" in self.model:
@@ -381,6 +385,7 @@ def _patched___call_api(self, prompt, return_logits, max_new_tokens, num_samples
 
             content = msg.content
             reasoning = getattr(msg, "reasoning_content", None)
+            # Accept reasoning-only replies
             if (not content) and reasoning:
                 return response
 
@@ -388,14 +393,11 @@ def _patched___call_api(self, prompt, return_logits, max_new_tokens, num_samples
                 logger.info("Response is empty, retrying without caching")
                 kwargs["caching"] = False
                 response = litellm.completion(**kwargs)
-                msg = response.choices[0].message
-                content = msg.content
-                reasoning = getattr(msg, "reasoning_content", None)
             return response
         except litellm.BadRequestError as e:
             if "message" in e.__dict__ and "policy" in e.__dict__["message"]:
-                logger.warning(f"Content filtered. Returning empty response.")
+                logger.warning("Content filtered. 
Returning empty response.") return LitellmModelResponse() except Exception as e: wait_time = min(64, self.API_RETRY_SLEEP * (self.API_RETRY_MULTIPLIER**attempt)) @@ -405,10 +407,9 @@ def _patched___call_api(self, prompt, return_logits, max_new_tokens, num_samples logger.error(f"API call failed after {self.API_MAX_RETRY} attempts.") return LitellmModelResponse() -# APPLY PATCH: Must use mangled name because original was private (__call_api) +# APPLY PATCH LiteLLMClient._LiteLLMClient__call_api = _patched___call_api -# --- Patched greedy_until: merge reasoning + content, preserve ordering --- def _greedy_until_impl(self, docs: list[Doc]) -> list[ModelResponse]: dataset = GenerativeTaskDataset(requests=docs, num_dataset_splits=self.DATASET_SPLITS) results: list[ModelResponse] = [] @@ -420,7 +421,8 @@ def _greedy_until_impl(self, docs: list[Doc]) -> list[ModelResponse]: position=0, disable=self.disable_tqdm, ): - contexts = [self.prompt_manager.prepare_prompt_api(doc) for doc in dataset] + # FIX: only build contexts for the current split + contexts = [self.prompt_manager.prepare_prompt_api(doc) for doc in split] max_new_tokens = split[0].generation_size return_logits = split[0].use_logits @@ -430,7 +432,6 @@ def _greedy_until_impl(self, docs: list[Doc]) -> list[ModelResponse]: if num_samples > 1 and self.generation_parameters.temperature == 0: raise ValueError("num_samples > 1 requires temperature > 0") - # CRITICAL FIX: Access the private method via mangled name responses = self._LiteLLMClient__call_api_parallel( contexts, return_logits, @@ -443,28 +444,30 @@ def _greedy_until_impl(self, docs: list[Doc]) -> list[ModelResponse]: raw_contents = [(choice.message.content or "").strip() for choice in response.choices] raw_reasonings = [(getattr(choice.message, "reasoning_content", None) or "").strip() for choice in response.choices] - merged: list[str] = [] + merged_texts: list[str] = [] + reasonings: list[str | None] = [] + for c, r in zip(raw_contents, raw_reasonings): if c and r: - merged.append(r + "\n\n" + c) + merged_texts.append(f"{r}\n\n{c}") elif c: - merged.append(c) + merged_texts.append(c) elif r: - merged.append(r) + merged_texts.append(f"{r}") else: - merged.append("") - - reasonings: list[str | None] = [r if r != "" else None for r in raw_reasonings] - - if not merged or merged[0] is None: - merged = [""] - - cur_response = ModelResponse( - text=merged, - reasonings=reasonings, - input=context, + merged_texts.append("") + reasonings.append(r if r != "" else None) + + if not merged_texts or merged_texts[0] is None: + merged_texts = [""] + + results.append( + ModelResponse( + text=merged_texts, + reasonings=reasonings, + input=context, + ) ) - results.append(cur_response) if len(results) != len(dataset): raise RuntimeError(f"Internal mismatch: {len(results)} outputs vs {len(dataset)} docs.") From 3e30425b4c9c3711d4f835fee4b2056fa3f1f95b Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 25 Nov 2025 16:11:35 +0800 Subject: [PATCH 113/214] lightevel fix --- .github/workflows/eval-gms8k.yml | 1 + benchmarks/benchmark_lib.sh | 5 ++--- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 34fe89d3c..808ec37bc 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -42,6 +42,7 @@ on: paths: - '.github/workflows/eval-gms8k.yml' - '.github/workflows/eval-tmpl.yml' + - 'benchmarks/benchmark_lib.sh' jobs: eval: diff --git a/benchmarks/benchmark_lib.sh 
b/benchmarks/benchmark_lib.sh index 6b9ac8f65..020b03953 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -518,7 +518,7 @@ run_lighteval_eval() { local base_url="http://0.0.0.0:${port}/v1" export OPENAI_API_KEY="${OPENAI_API_KEY:-EMPTY}" - local MODEL_ARGS="model_name=${lite_model},base_url=${base_url},api_key=${OPENAI_API_KEY},generation_parameters={temperature:1.0}" + local MODEL_ARGS="model_name=${lite_model},base_url=${base_url},api_key=${OPENAI_API_KEY},generation_parameters={temperature:0.0,max_new_tokens:2056}" local TASK_SPEC="${task}|${num_fewshot}" set -x @@ -526,8 +526,7 @@ run_lighteval_eval() { "${MODEL_ARGS}" \ "${TASK_SPEC}" \ --output-dir "/workspace/${results_dir}" \ - --max-samples "${max_samples}" \ - --remove-reasoning-tags + --max-samples "${max_samples}" set +x } From 0d87ea5063dabb770496e0dea0692e229ef737cb Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 25 Nov 2025 16:49:17 +0800 Subject: [PATCH 114/214] lighteval test h100-cw_1 --- .github/workflows/eval-gms8k.yml | 2 +- .github/workflows/eval-tmpl.yml | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 808ec37bc..9d06c5623 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -49,7 +49,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: h200-nb_1 + runner: h100-cw_1 image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 63e568dd2..23bec7006 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -111,9 +111,7 @@ jobs: echo "[Docker] skipping docker cleanup on host $host" fi # Best-effort cleanup of prior eval outputs; do not block - sudo rm -rf /home/ubuntu/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out/openai__gpt-oss-120b || true - sudo rm -rf /home/ubuntu/actions-runner/_work/InferenceMAX/InferenceMAX/.litellm_cache/cache.db || true - + if command -v squeue >/dev/null 2>&1; then echo "[Slurm] Cleaning up resources ..." safe_timeout scancel -u "$USER" || true From 00b1623ee9ff1b9516d1243f041af18d2822109a Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 25 Nov 2025 16:54:18 +0800 Subject: [PATCH 115/214] lighteval test h100-cr_1 + parsing --- .github/workflows/eval-gms8k.yml | 2 +- utils/lm_eval_to_md.py | 247 ++++++++++++++++++++++++++----- 2 files changed, 208 insertions(+), 41 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 9d06c5623..0951f4254 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -49,7 +49,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: h100-cw_1 + runner: h100-cr_1 image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm diff --git a/utils/lm_eval_to_md.py b/utils/lm_eval_to_md.py index dbcc4d88d..0c59bc494 100644 --- a/utils/lm_eval_to_md.py +++ b/utils/lm_eval_to_md.py @@ -1,10 +1,10 @@ #!/usr/bin/env python3 """ -Convert latest lm-evaluation-harness JSON in a results dir into a Markdown table -for GitHub Actions job summary. Prints to stdout. +Convert latest lm-evaluation-harness and/or lighteval JSONs in a results dir +into Markdown tables for GitHub Actions job summary. Prints to stdout. 
-Usage:
-    python3 bench_serving/scripts/lm_eval_to_md.py \
+Usage (same as before, works even if FRAMEWORK/PRECISION env vars are empty):
+    python3 utils/lm_eval_to_md.py \
         --results-dir /workspace/eval_out \
         --task gsm8k \
         --framework vLLM \
@@ -13,25 +13,28 @@
     --ep 1 \
     --dp-attention false
 """
-import argparse, json, os, re, sys
+import argparse
+import json
+import os
+import re
+import sys
 from collections import Counter
 from glob import glob
+from typing import Optional, Tuple, Dict, Any, List
 
-def find_latest_json(results_dir: str):
-    paths = []
-    for root, _, _ in os.walk(results_dir):
-        paths.extend(glob(os.path.join(root, "*.json")))
-    if not paths:
-        return None
-    paths.sort(key=lambda p: os.path.getmtime(p), reverse=True)
-    return paths[0]
+
+# -----------------------
+# Helpers
+# -----------------------
 
 def pct(x):
     return f"{x*100:.2f}%" if isinstance(x, (int, float)) else "N/A"
 
+
 def se(x):
     return f" \u00B1{(x*100):.2f}%" if isinstance(x, (int, float)) else ""
 
+
 def gpu_cpu_from_pretty_env(pe: str):
     if not isinstance(pe, str) or not pe:
         return "Unknown GPU"
@@ -42,20 +45,74 @@ def gpu_cpu_from_pretty_env(pe: str):
     cpu_line = next((l.split(":", 1)[1].strip() for l in pe.splitlines() if l.startswith("Model name:")), None)
     return gpu_summary + (f" ({cpu_line})" if cpu_line else "")
 
-def extract_metrics(data: dict, task: str):
-    # results section can vary across harness versions
+
+def detect_framework_kind(data: Dict[str, Any]) -> str:
+    """
+    Classify JSON as:
+    - 'lm-eval'   : lm-evaluation-harness style JSON
+    - 'lighteval' : lighteval JSON
+    - 'unknown'   : anything else
+    """
+    # lm-eval has lm_eval_version + results structure like results["gsm8k"]...
+    if "lm_eval_version" in data or "pretty_env_info" in data:
+        return "lm-eval"
+    # lighteval has config_general + results keyed by "|"
+    if "config_general" in data and "results" in data:
+        return "lighteval"
+    return "unknown"
+
+
+def find_all_jsons(results_dir: str) -> List[str]:
+    paths = []
+    for root, _, _ in os.walk(results_dir):
+        for name in glob(os.path.join(root, "*.json")):
+            paths.append(name)
+    return paths
+
+
+def find_latest_by_kind(results_dir: str) -> Tuple[Optional[str], Optional[str]]:
+    """
+    Scan all JSONs under results_dir and return:
+        (latest_lm_eval_json_path, latest_lighteval_json_path)
+    """
+    lm_eval_candidates = []
+    lighteval_candidates = []
+
+    for path in find_all_jsons(results_dir):
+        try:
+            with open(path, "r") as f:
+                data = json.load(f)
+        except Exception:
+            continue
+
+        kind = detect_framework_kind(data)
+        mtime = os.path.getmtime(path)
+        if kind == "lm-eval":
+            lm_eval_candidates.append((mtime, path))
+        elif kind == "lighteval":
+            lighteval_candidates.append((mtime, path))
+
+    lm_path = max(lm_eval_candidates, default=(None, None))[1]
+    le_path = max(lighteval_candidates, default=(None, None))[1]
+    return lm_path, le_path
+
+
+# -----------------------
+# lm-eval parsing
+# -----------------------
+
+def extract_lm_eval_metrics(data: Dict[str, Any], task: str) -> Dict[str, Any]:
     res_all = data.get("results", {}) or {}
     res = res_all.get(task) if isinstance(res_all, dict) else {}
     if not res and isinstance(res_all, dict) and res_all:
-        # fallback to first key if requested task missing
        any_key = next(iter(res_all.keys()))
        res = res_all.get(any_key, {})
        task = any_key
    strict = res.get("exact_match,strict-match")
-    flex = res.get("exact_match,flexible-extract")
+    flex   = res.get("exact_match,flexible-extract")
     strict_se = res.get("exact_match_stderr,strict-match")
-    flex_se = res.get("exact_match_stderr,flexible-extract")
+    flex_se   = res.get("exact_match_stderr,flexible-extract")
 
     n_eff = None
     ns = data.get("n-samples") or data.get("n_samples") or {}
@@ -64,19 +121,16 @@ def extract_metrics(data: dict, task: str):
     if isinstance(tdict, dict):
         n_eff = tdict.get("effective") or tdict.get("n_eff")
 
-    # model/fewshot/limit are scattered depending on version
     model = data.get("model_name") \
         or data.get("configs", {}).get(task, {}).get("metadata", {}).get("model") \
         or data.get("config", {}).get("model") \
         or ""
 
-    # k-shot
     fewshot = None
     nshot = data.get("n-shot") or data.get("n_shot") or {}
     if isinstance(nshot, dict):
         fewshot = nshot.get(task) or nshot.get("gsm8k")
 
-    # limit
     limit = None
     cfg = data.get("config") or {}
     if isinstance(cfg, dict):
@@ -91,42 +145,155 @@ def extract_metrics(data: dict, task: str):
         "n_eff": n_eff,
         "model": model,
         "fewshot": fewshot,
-        "limit": limit
+        "limit": limit,
     }
+
+
+def render_lm_eval_section(path: str,
+                           args,
+                           framework_label: str,
+                           precision_label: str) -> Tuple[str, Dict[str, Any]]:
+    with open(path, "r") as f:
+        data = json.load(f)
+
+    hardware = gpu_cpu_from_pretty_env(data.get("pretty_env_info", ""))
+    m = extract_lm_eval_metrics(data, args.task)
+
+    print(f"### {args.task} Evaluation (lm-eval-harness)\n")
+    print("| Hardware | Framework | Precision | TP | EP | DP Attention | EM Strict | EM Flexible | N (eff) |")
+    print("|---|---|---:|--:|--:|:--:|--:|--:|--:|")
+    print(
+        f"| {hardware} | {framework_label} | {precision_label} | {args.tp} | {args.ep} | "
+        f"{str(args.dp_attention).lower()} | "
+        f"{pct(m['strict'])}{se(m['strict_se'])} | "
+        f"{pct(m['flex'])}{se(m['flex_se'])} | {m['n_eff'] or ''} |"
+    )
+
+    lim = m["limit"]
+    lim_str = str(int(lim)) if isinstance(lim, (int, float)) else (str(lim) if lim is not None else "")
+    fewshot = m["fewshot"] if m["fewshot"] is not None else ""
+    print(
+        f"\n_Model_: `{m['model']}`&nbsp;&nbsp;&nbsp;&nbsp;"
+        f"_k-shot_: **{fewshot}**&nbsp;&nbsp;&nbsp;&nbsp;"
+        f"_limit_: **{lim_str}**  \n"
+        f"_Source_: `{os.path.basename(path)}`"
+    )
+    return hardware, m
+
+
+# -----------------------
+# lighteval parsing
+# -----------------------
+
+def extract_lighteval_metrics(data: Dict[str, Any], task_base: str) -> Dict[str, Any]:
+    res_all = data.get("results", {}) or {}
+
+    # Prefer task-specific key like "gsm8k|5" over "all"
+    task_key = None
+    for k in res_all.keys():
+        if k.startswith(task_base):
+            task_key = k
+            break
+    if task_key is None and "all" in res_all:
+        task_key = "all"
+
+    r = res_all.get(task_key, {})
+    em = r.get("extractive_match")
+    em_se = r.get("extractive_match_stderr")
+
+    # Fewshot & other metadata from config_tasks if available
+    fewshot = None
+    cfg_tasks = data.get("config_tasks", {})
+    if isinstance(cfg_tasks, dict) and task_key in cfg_tasks:
+        fewshot = cfg_tasks[task_key].get("num_fewshots")
+
+    # Model name from config_general
+    cg = data.get("config_general", {}) or {}
+    model = cg.get("model_name") or cg.get("model_config", {}).get("model_name", "")
+
+    return {
+        "task": task_key or task_base,
+        "em": em,
+        "em_se": em_se,
+        "fewshot": fewshot,
+        "model": model,
+        # this lighteval JSON doesn't expose an obvious effective N; leave blank
+        "n_eff": None,
+    }
+
+
+def 
render_lighteval_section(path: str, + args, + framework_label: str, + precision_label: str, + hardware_fallback: Optional[str]) -> None: + with open(path, "r") as f: + data = json.load(f) + + m = extract_lighteval_metrics(data, args.task) + hardware = hardware_fallback or "Unknown GPU" + + print(f"### {args.task} Evaluation (lighteval)\n") + print("| Hardware | Framework | Precision | TP | EP | DP Attention | Extractive Match | N (eff) |") + print("|---|---|---:|--:|--:|:--:|--:|--:|") + print( + f"| {hardware} | {framework_label} | {precision_label} | {args.tp} | {args.ep} | " + f"{str(args.dp_attention).lower()} | " + f"{pct(m['em'])}{se(m['em_se'])} | {m['n_eff'] or ''} |" + ) + + fewshot = m["fewshot"] if m["fewshot"] is not None else "" + print( + f"\n_Model_: `{m['model']}`    " + f"_k-shot_: **{fewshot}** \n" + f"_Source_: `{os.path.basename(path)}`" + ) + + +# ----------------------- +# main +# ----------------------- + def main(): ap = argparse.ArgumentParser() ap.add_argument("--results-dir", required=True) ap.add_argument("--task", default="gsm8k") - ap.add_argument("--framework", default=os.environ.get("FRAMEWORK", "vLLM")) - ap.add_argument("--precision", default=os.environ.get("PRECISION", "fp16")) + ap.add_argument("--framework", default=os.environ.get("FRAMEWORK", "")) + ap.add_argument("--precision", default=os.environ.get("PRECISION", "")) ap.add_argument("--tp", default=os.environ.get("TP", "1")) ap.add_argument("--ep", default=os.environ.get("EP_SIZE", "1")) ap.add_argument("--dp-attention", default=os.environ.get("DP_ATTENTION", "false")) args = ap.parse_args() - path = find_latest_json(args.results_dir) - print(f"### {args.task} Evaluation\n") - if not path or not os.path.exists(path): + # Robust defaults if env vars / CLI args are empty + framework_label = args.framework or os.environ.get("FRAMEWORK") or "unknown" + precision_label = args.precision or os.environ.get("PRECISION") or "unknown" + + lm_path, le_path = find_latest_by_kind(args.results_dir) + + if not lm_path and not le_path: + print(f"### {args.task} Evaluation\n") print(f"> No result JSON found in `{args.results_dir}`.") return - with open(path, "r") as f: - data = json.load(f) + hardware_from_lm = None - hardware = gpu_cpu_from_pretty_env(data.get("pretty_env_info", "")) - m = extract_metrics(data, args.task) + # 1) lm-eval section (if present) + if lm_path: + hardware_from_lm, _ = render_lm_eval_section( + lm_path, args, framework_label, precision_label + ) - print("| Hardware | Framework | Precision | TP | EP | DP Attention | EM Strict | EM Flexible | N (eff) |") - print("|---|---|---:|--:|--:|:--:|--:|--:|--:|") - print(f"| {hardware} | {args.framework} | {args.precision} | {args.tp} | {args.ep} | {str(args.dp_attention).lower()} | " - f"{pct(m['strict'])}{se(m['strict_se'])} | {pct(m['flex'])}{se(m['flex_se'])} | {m['n_eff'] or ''} |") + # Spacer between sections if both exist + if lm_path and le_path: + print("\n") + + # 2) lighteval section (if present) + if le_path: + render_lighteval_section( + le_path, args, framework_label, precision_label, hardware_from_lm + ) - # metadata line - lim = m["limit"] - lim_str = str(int(lim)) if isinstance(lim, (int, float)) else (str(lim) if lim is not None else "") - fewshot = m["fewshot"] if m["fewshot"] is not None else "" - print(f"\n_Model_: `{m['model']}`    _k-shot_: **{fewshot}**    _limit_: **{lim_str}** \n_Source_: `{os.path.basename(path)}`") if __name__ == "__main__": try: From 83a71d22537ee05db9a72692b58f57170220b71b Mon Sep 17 00:00:00 2001 From: 
Oseltamivir Date: Tue, 25 Nov 2025 17:10:01 +0800 Subject: [PATCH 116/214] lighteval test b200_nvd --- .github/workflows/eval-gms8k.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 0951f4254..808ec37bc 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -49,7 +49,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: h100-cr_1 + runner: h200-nb_1 image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm From df71abeb987e6857e2b1aa06ce4e994203ea48b9 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 25 Nov 2025 17:30:39 +0800 Subject: [PATCH 117/214] lighteval test b200_nvd --- .github/workflows/eval-gms8k.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 808ec37bc..cbcbe9423 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -49,7 +49,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: h200-nb_1 + runner: b200_nvd-0 image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm From 4aa8d3446b5f3f5f06755bbd3b1765044f54ffc7 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 25 Nov 2025 17:37:27 +0800 Subject: [PATCH 118/214] lighteval test mi300x-amd_0 --- .github/workflows/eval-gms8k.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index cbcbe9423..233f552f4 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -49,8 +49,8 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: b200_nvd-0 - image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} + runner: mi300x-amd_0 + image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm precision: fp4 From fe2ecd5eaeb25a417e8d65662fe2440819bd0522 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 25 Nov 2025 17:40:55 +0800 Subject: [PATCH 119/214] lighteval test h100-cw_1 --- .github/workflows/eval-gms8k.yml | 4 ++-- benchmarks/benchmark_lib.sh | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 233f552f4..9d06c5623 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -49,8 +49,8 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi300x-amd_0 - image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} + runner: h100-cw_1 + image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm precision: fp4 diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 020b03953..b0618c59a 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -474,8 +474,10 @@ def _greedy_until_impl(self, docs: list[Doc]) -> list[ModelResponse]: return dataset.get_original_order(results) -# Re-apply caching decorator -LiteLLMClient.greedy_until = cached(SamplingMethod.GENERATIVE)(_greedy_until_impl) +# Disable lighteval on-disk caching to avoid filesystem issues with task 
names +# like "gsm8k|5" becoming part of cache paths on certain filesystems. +# We directly bind the greedy method without the caching decorator. +LiteLLMClient.greedy_until = _greedy_until_impl PY export PYTHONPATH="${patch_dir}:${PYTHONPATH:-}" } @@ -553,4 +555,4 @@ run_eval() { *) echo "Unknown framework '${framework}'"; return 1 ;; esac -} \ No newline at end of file +} From fef016a51779ff50a3c5de2793dd660d9747823f Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 25 Nov 2025 17:42:27 +0800 Subject: [PATCH 120/214] lighteval test mi300x-cr_0 --- .github/workflows/eval-gms8k.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 9d06c5623..82a588bf6 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -49,8 +49,8 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: h100-cw_1 - image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} + runner: mi300x-cr_0 + image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm precision: fp4 From 124eb70ae63cb1c48ba55ac007ecfa090c8ba007 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 25 Nov 2025 17:58:24 +0800 Subject: [PATCH 121/214] lighteval test mi325x-tw_1 --- .github/workflows/eval-gms8k.yml | 2 +- benchmarks/gptoss_fp4_b200_docker.sh | 2 +- benchmarks/gptoss_fp4_h100_docker.sh | 2 +- benchmarks/gptoss_fp4_h100_slurm.sh | 2 +- benchmarks/gptoss_fp4_h200_slurm.sh | 2 +- benchmarks/gptoss_fp4_mi300x_docker.sh | 2 +- benchmarks/gptoss_fp4_mi325x_slurm.sh | 2 +- benchmarks/gptoss_fp4_mi355x_docker.sh | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 82a588bf6..46d676b34 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -49,7 +49,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi300x-cr_0 + runner: mi325x-tw_1 image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh index 1f4679383..f33e23bd6 100644 --- a/benchmarks/gptoss_fp4_b200_docker.sh +++ b/benchmarks/gptoss_fp4_b200_docker.sh @@ -80,7 +80,7 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_eval --framework lm-eval --port "$PORT" +#run_eval --framework lm-eval --port "$PORT" run_eval --framework lighteval --task gsm8k --num-fewshot 5 append_lm_eval_summary set +x \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index e9c1cfc5a..463f31b90 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -61,7 +61,7 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_eval --framework lm-eval --port "$PORT" +#run_eval --framework lm-eval --port "$PORT" run_eval --framework lighteval --task gsm8k --num-fewshot 5 append_lm_eval_summary set +x \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh index 394d68bc1..19ad98294 100644 --- a/benchmarks/gptoss_fp4_h100_slurm.sh +++ 
b/benchmarks/gptoss_fp4_h100_slurm.sh @@ -62,7 +62,7 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_eval --framework lm-eval --port "$PORT" +#run_eval --framework lm-eval --port "$PORT" run_eval --framework lighteval --task gsm8k --num-fewshot 5 append_lm_eval_summary set +x diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh index 9906b2fa5..e5a3a6961 100644 --- a/benchmarks/gptoss_fp4_h200_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_slurm.sh @@ -75,7 +75,7 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_eval --framework lm-eval --port "$PORT" +#run_eval --framework lm-eval --port "$PORT" run_eval --framework lighteval --task gsm8k --num-fewshot 5 append_lm_eval_summary set +x \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh index 777aa2c2d..0dade438d 100644 --- a/benchmarks/gptoss_fp4_mi300x_docker.sh +++ b/benchmarks/gptoss_fp4_mi300x_docker.sh @@ -62,7 +62,7 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_eval --framework lm-eval --port "$PORT" +#run_eval --framework lm-eval --port "$PORT" run_eval --framework lighteval --task gsm8k --num-fewshot 5 append_lm_eval_summary set +x \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh index 3394bcc04..b3bdfbec7 100644 --- a/benchmarks/gptoss_fp4_mi325x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh @@ -70,7 +70,7 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_eval --framework lm-eval --port "$PORT" +#run_eval --framework lm-eval --port "$PORT" run_eval --framework lighteval --task gsm8k --num-fewshot 5 append_lm_eval_summary set +x \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh index a413acd69..305210cf7 100644 --- a/benchmarks/gptoss_fp4_mi355x_docker.sh +++ b/benchmarks/gptoss_fp4_mi355x_docker.sh @@ -60,7 +60,7 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_eval --framework lm-eval --port "$PORT" +#run_eval --framework lm-eval --port "$PORT" run_eval --framework lighteval --task gsm8k --num-fewshot 5 append_lm_eval_summary set +x \ No newline at end of file From 2b0b9860e29fc700acbb7e1489199afe9307c726 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 25 Nov 2025 18:33:56 +0800 Subject: [PATCH 122/214] lighteval test mi355x-amd_4 --- .github/workflows/eval-gms8k.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 46d676b34..af0baa997 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -49,7 +49,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi325x-tw_1 + runner: mi355x-amd_5 image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm From dae73454a492bd2ac98e6026aa624b15fff975ec Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 25 Nov 2025 18:37:14 +0800 Subject: [PATCH 123/214] lighteval test b200-nvd_3 --- .github/workflows/eval-gms8k.yml | 4 ++-- .github/workflows/eval-tmpl.yml | 1 - 2 files changed, 
2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index af0baa997..308212ca6 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -49,8 +49,8 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi355x-amd_5 - image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} + runner: b200-nvd_3 + image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm precision: fp4 diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 23bec7006..8b6537986 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -162,5 +162,4 @@ jobs: sudo rm -rf \ "${{ github.workspace }}/.litellm_cache" \ "${{ github.workspace }}/eval_out"* \ - "${{ github.workspace }}/.cache" \ || true \ No newline at end of file From 993b19f9d1e667200fdda2da874dc429150066c6 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 25 Nov 2025 19:55:54 +0800 Subject: [PATCH 124/214] lighteval test h100-cw_1 sudo test --- .github/workflows/eval-gms8k.yml | 2 +- .github/workflows/eval-tmpl.yml | 14 ++++++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 308212ca6..9d06c5623 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -49,7 +49,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: b200-nvd_3 + runner: h100-cw_1 image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 8b6537986..62e37d553 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -79,6 +79,8 @@ jobs: steps: - name: Resource cleanup run: | + sudo mv /home/nvadmin/actions-runner/_work/InferenceMAX/InferenceMAX/.litellm_cache /tmp/ || true + sudo -n rm -rf /home/nvadmin/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out* # Helper to avoid indefinite hangs on flaky tools (Docker/Slurm) safe_timeout() { if command -v timeout >/dev/null 2>&1; then @@ -159,7 +161,11 @@ jobs: - name: Resource cleanup run: | - sudo rm -rf \ - "${{ github.workspace }}/.litellm_cache" \ - "${{ github.workspace }}/eval_out"* \ - || true \ No newline at end of file + ls -lt + pkill -f litellm || true + sleep 2 + if command -v fuser >/dev/null 2>&1; then + fuser -k /home/nvadmin/actions-runner/_work/InferenceMAX/InferenceMAX/.litellm_cache/cache.db 2>/dev/null || true + fi + sudo rm -rf .litellm_cache + sudo rm -rf eval_out* \ No newline at end of file From f5b3a7af372868b66619af8dacda9e0e05f2473a Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 25 Nov 2025 22:59:59 +0800 Subject: [PATCH 125/214] b200 fix check --- .github/workflows/eval-gms8k.yml | 2 +- .github/workflows/eval-tmpl.yml | 5 ++--- benchmarks/benchmark_lib.sh | 4 ++++ benchmarks/gptoss_fp4_h200_slurm.sh | 2 +- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 9d06c5623..808ec37bc 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -49,7 +49,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: h100-cw_1 + runner: h200-nb_1 image: ${{ inputs.image || 
'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 62e37d553..8db652350 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -79,8 +79,6 @@ jobs: steps: - name: Resource cleanup run: | - sudo mv /home/nvadmin/actions-runner/_work/InferenceMAX/InferenceMAX/.litellm_cache /tmp/ || true - sudo -n rm -rf /home/nvadmin/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out* # Helper to avoid indefinite hangs on flaky tools (Docker/Slurm) safe_timeout() { if command -v timeout >/dev/null 2>&1; then @@ -161,7 +159,8 @@ jobs: - name: Resource cleanup run: | - ls -lt + ls -lt eval_out + ls -lt .litellm_cache pkill -f litellm || true sleep 2 if command -v fuser >/dev/null 2>&1; then diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index b0618c59a..b5750495e 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -100,6 +100,7 @@ run_benchmark_serving() { fi set -x + echo "Before benchmark_serving: $(id -u) $(id -g) $(id -un)" >&2 python3 "$BENCH_SERVING_DIR/benchmark_serving.py" \ --model "$model" \ --backend "$backend" \ @@ -269,6 +270,7 @@ run_lm_eval() { export OPENAI_API_KEY=${OPENAI_API_KEY:-EMPTY} set -x + echo "Before lm_eval: $(id -u) $(id -g) $(id -un)" >&2 python3 -m lm_eval --model local-chat-completions --apply_chat_template \ --tasks "${task}" \ --num_fewshot "${num_fewshot}" \ @@ -524,6 +526,7 @@ run_lighteval_eval() { local TASK_SPEC="${task}|${num_fewshot}" set -x + echo "Before lighteval: $(id -u) $(id -g) $(id -un)" >&2 lighteval endpoint litellm \ "${MODEL_ARGS}" \ "${TASK_SPEC}" \ @@ -555,4 +558,5 @@ run_eval() { *) echo "Unknown framework '${framework}'"; return 1 ;; esac + ls -ld /workspace /workspace/eval_out* /workspace/results* } diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh index e5a3a6961..9906b2fa5 100644 --- a/benchmarks/gptoss_fp4_h200_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_slurm.sh @@ -75,7 +75,7 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -#run_eval --framework lm-eval --port "$PORT" +run_eval --framework lm-eval --port "$PORT" run_eval --framework lighteval --task gsm8k --num-fewshot 5 append_lm_eval_summary set +x \ No newline at end of file From ff1eba60c1d7e2dfecb08ba02ebfb81afbf108f2 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 25 Nov 2025 23:00:23 +0800 Subject: [PATCH 126/214] b200 fix check --- .github/workflows/eval-gms8k.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 808ec37bc..308212ca6 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -49,7 +49,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: h200-nb_1 + runner: b200-nvd_3 image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm From d6a52ec14e97dd840a594678144133933334452d Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 25 Nov 2025 23:01:28 +0800 Subject: [PATCH 127/214] b200 fix check --- .github/workflows/eval-tmpl.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 8db652350..c14e8e509 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml 
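# [Editorial aside, not part of the recorded patch: the hunk below cleans up
# with paths hard-coded to a single runner's home directory. A guarded,
# runner-agnostic sketch of the same cleanup, assuming the standard
# GITHUB_WORKSPACE variable — the :? expansion aborts the step instead of
# expanding to an empty path if the variable is unset:
#
#   sudo rm -rf "${GITHUB_WORKSPACE:?}/.litellm_cache" "${GITHUB_WORKSPACE:?}"/eval_out* || true
#
# Patches 128-133 below iterate through several variants of this cleanup.]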
@@ -87,6 +87,8 @@ jobs: "$@" fi } + sudo rm -rf /home/nvadmin/actions-runner/_work/InferenceMAX/InferenceMAX/.litellm_cache/ + sudo rm -rf /home/nvadmin/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out* host=$(hostname) if [[ "$host" == "b200-81" || "$host" == "b200-80" || "$host" == "b200-79" ]]; then From 4dd7e2162d6366b3bfb7e0483a7f06f10d50c253 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 25 Nov 2025 23:05:56 +0800 Subject: [PATCH 128/214] b200 fix check --- .github/workflows/eval-tmpl.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index c14e8e509..94f8ec395 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -87,9 +87,9 @@ jobs: "$@" fi } - sudo rm -rf /home/nvadmin/actions-runner/_work/InferenceMAX/InferenceMAX/.litellm_cache/ - sudo rm -rf /home/nvadmin/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out* - + set -x + sudo rm -rf $GITHUB_WORKSPACE + set +x host=$(hostname) if [[ "$host" == "b200-81" || "$host" == "b200-80" || "$host" == "b200-79" ]]; then if command -v docker >/dev/null 2>&1; then From 37bd3df1c9d848bb3858e6a20b94d1f07288140d Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 25 Nov 2025 23:13:36 +0800 Subject: [PATCH 129/214] b200 fix check --- .github/workflows/eval-tmpl.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 94f8ec395..c6b538548 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -88,7 +88,7 @@ jobs: fi } set -x - sudo rm -rf $GITHUB_WORKSPACE + sudo rm -rf ${GITHUB_WORKSPACE}/* || true set +x host=$(hostname) if [[ "$host" == "b200-81" || "$host" == "b200-80" || "$host" == "b200-79" ]]; then From 43c7c595c94fdd14de3cbdf1869a35513b918abb Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 25 Nov 2025 23:15:21 +0800 Subject: [PATCH 130/214] b200 fix check --- .github/workflows/eval-tmpl.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index c6b538548..1491d74d3 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -88,7 +88,7 @@ jobs: fi } set -x - sudo rm -rf ${GITHUB_WORKSPACE}/* || true + sudo rm -rfv ${GITHUB_WORKSPACE}/* || true set +x host=$(hostname) if [[ "$host" == "b200-81" || "$host" == "b200-80" || "$host" == "b200-79" ]]; then From e5a8e3ae30e849e8632e57e75655736bdb2b8bed Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 25 Nov 2025 23:22:38 +0800 Subject: [PATCH 131/214] b200 fix check --- .github/workflows/eval-tmpl.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 1491d74d3..8501af2ea 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -87,9 +87,6 @@ jobs: "$@" fi } - set -x - sudo rm -rfv ${GITHUB_WORKSPACE}/* || true - set +x host=$(hostname) if [[ "$host" == "b200-81" || "$host" == "b200-80" || "$host" == "b200-79" ]]; then if command -v docker >/dev/null 2>&1; then @@ -133,6 +130,9 @@ jobs: echo "[Slurm] Jobs still present after timeout; proceeding" fi fi + set -x + sudo rm -rfv ${GITHUB_WORKSPACE}/* || true + set +x - uses: actions/checkout@v5 with: From 8fb95f4ce84aa906f680eefd5883a29260ab045e Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 25 Nov 2025 23:29:02 +0800 Subject: [PATCH 132/214] b200 fix 
check

---
 .github/workflows/eval-gms8k.yml | 2 +-
 .github/workflows/eval-tmpl.yml | 3 ---
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml
index 308212ca6..2101ba972 100644
--- a/.github/workflows/eval-gms8k.yml
+++ b/.github/workflows/eval-gms8k.yml
@@ -49,7 +49,7 @@ jobs:
     uses: ./.github/workflows/eval-tmpl.yml
     secrets: inherit
     with:
-      runner: b200-nvd_3
+      runner: b200-nvd_1
       image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }}
       model: ${{ inputs.model || 'openai/gpt-oss-120b' }}
       framework: vllm
diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml
index 8501af2ea..4fb1f159c 100644
--- a/.github/workflows/eval-tmpl.yml
+++ b/.github/workflows/eval-tmpl.yml
@@ -130,9 +130,6 @@ jobs:
             echo "[Slurm] Jobs still present after timeout; proceeding"
           fi
         fi
-        set -x
-        sudo rm -rfv ${GITHUB_WORKSPACE}/* || true
-        set +x

     - uses: actions/checkout@v5
       with:

From 237b4e8e9ce03a8cb492ccc1aeba94a87fc5a410 Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Tue, 25 Nov 2025 23:29:54 +0800
Subject: [PATCH 133/214] b200 fix check

---
 .github/workflows/eval-gms8k.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml
index 2101ba972..3456c422d 100644
--- a/.github/workflows/eval-gms8k.yml
+++ b/.github/workflows/eval-gms8k.yml
@@ -49,7 +49,7 @@ jobs:
     uses: ./.github/workflows/eval-tmpl.yml
     secrets: inherit
     with:
-      runner: b200-nvd_1
+      runner: b200-nvd_2
       image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }}
       model: ${{ inputs.model || 'openai/gpt-oss-120b' }}
       framework: vllm

From 79eadc5d958a72898885ffed76530a0d885f9127 Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Wed, 26 Nov 2025 19:45:24 +0800
Subject: [PATCH 134/214] Preliminary lighteval for all

---
 .github/workflows/eval-gms8k.yml | 12 +--
 benchmarks/benchmark_lib.sh | 135 +++++++++++++++++-------
 benchmarks/dsr1_fp4_b200_docker.sh | 4 +-
 benchmarks/dsr1_fp4_mi355x_docker.sh | 5 +-
 benchmarks/dsr1_fp8_b200_docker.sh | 7 +-
 benchmarks/dsr1_fp8_h200_slurm.sh | 4 +-
 benchmarks/dsr1_fp8_h200_trt_slurm.sh | 5 +-
 benchmarks/dsr1_fp8_mi300x_docker.sh | 7 +-
 benchmarks/dsr1_fp8_mi300x_slurm.sh | 5 +
 benchmarks/dsr1_fp8_mi325x_docker.sh | 4 +
 benchmarks/dsr1_fp8_mi325x_slurm.sh | 7 +-
 benchmarks/dsr1_fp8_mi355x_docker.sh | 4 +-
 benchmarks/dsr1_fp8_mi355x_slurm.sh | 4 +-
 benchmarks/gptoss_fp4_h200_slurm.sh | 2 +-
 benchmarks/gptoss_fp4_h200_trt_slurm.sh | 2 +-
 benchmarks/gptoss_fp4_mi300x_slurm.sh | 2 +-
 benchmarks/gptoss_fp4_mi325x_docker.sh | 2 +-
 benchmarks/gptoss_fp4_mi355x_slurm.sh | 2 +-
 18 files changed, 151 insertions(+), 62 deletions(-)

diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml
index 3456c422d..d1b33a6c1 100644
--- a/.github/workflows/eval-gms8k.yml
+++ b/.github/workflows/eval-gms8k.yml
@@ -49,12 +49,12 @@ jobs:
     uses: ./.github/workflows/eval-tmpl.yml
     secrets: inherit
     with:
-      runner: b200-nvd_2
-      image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }}
-      model: ${{ inputs.model || 'openai/gpt-oss-120b' }}
-      framework: vllm
-      precision: fp4
-      exp-name: ${{ inputs.exp-name || 'gptoss_gsm8k_poc' }}
+      runner: mi325x-tw_1
+      image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' }}
+      model: ${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }}
+      framework: sglang
+      precision: fp8
+      exp-name: ${{ inputs.exp-name || 'dsr1_gsm8k_poc' }}
       tp: '2'
       ep: '1'
       dp-attn: false
       port: ${{ inputs.port || '8888' }}
diff --git
a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index b5750495e..a7ab5c5c9 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -100,7 +100,6 @@ run_benchmark_serving() { fi set -x - echo "Before benchmark_serving: $(id -u) $(id -g) $(id -un)" >&2 python3 "$BENCH_SERVING_DIR/benchmark_serving.py" \ --model "$model" \ --backend "$backend" \ @@ -270,7 +269,6 @@ run_lm_eval() { export OPENAI_API_KEY=${OPENAI_API_KEY:-EMPTY} set -x - echo "Before lm_eval: $(id -u) $(id -g) $(id -un)" >&2 python3 -m lm_eval --model local-chat-completions --apply_chat_template \ --tasks "${task}" \ --num_fewshot "${num_fewshot}" \ @@ -329,27 +327,46 @@ _patch_lighteval_litellm() { patch_dir="$(mktemp -d)" cat > "$patch_dir/sitecustomize.py" <<'PY' import logging +import os import time +import re +from concurrent.futures import ThreadPoolExecutor, as_completed import litellm from tqdm import tqdm litellm.suppress_debug_info = True +litellm.drop_params = True + +# Remove sglang import that crashes +try: + # This is where lighteval's is_package_available lives + from lighteval.utils import imports as le_imports +except Exception: + le_imports = None +else: + _orig_is_package_available = le_imports.is_package_available + + def _patched_is_package_available(pkg: str) -> bool: + # Force "sglang" to look unavailable so that + # lighteval.models.sglang.sglang_model never imports `sglang` + if pkg == "sglang": + return False + return _orig_is_package_available(pkg) + + le_imports.is_package_available = _patched_is_package_available from lighteval.models.endpoints.litellm_model import LiteLLMClient from lighteval.data import GenerativeTaskDataset -from lighteval.tasks.requests import Doc, SamplingMethod +from lighteval.tasks.requests import Doc from lighteval.models.model_output import ModelResponse -from lighteval.utils.cache_management import cached logger = logging.getLogger(__name__) -# --- Patched __call_api: don't retry when we have reasoning_content, enforce chat template on vLLM and avoid stop interference --- -def _patched___call_api(self, prompt, return_logits, max_new_tokens, num_samples, stop_sequence): # noqa: C901 +def _patched___call_api(self, prompt, return_logits, max_new_tokens, num_samples, stop_sequence): # noqa: C901, N802 from lighteval.models.endpoints.litellm_model import LitellmModelResponse response = LitellmModelResponse() - - stop_sequence = None # Important: let the chat template drive turn-taking + # Keep dataset-provided stop sequences to cut early max_new_tokens = self._prepare_max_new_tokens(max_new_tokens) if return_logits and not self.provider == "openai": @@ -360,18 +377,22 @@ def _patched___call_api(self, prompt, return_logits, max_new_tokens, num_samples "messages": prompt, "max_tokens": max_new_tokens, "logprobs": return_logits if self.provider == "openai" else None, - "stop": stop_sequence, # disabled for chat + "stop": stop_sequence, "base_url": self.base_url, "api_key": self.api_key, "n": num_samples, - "caching": True, "timeout": self.timeout, - # vLLM OpenAI server: ensure chat template is applied and an assistant turn is started - "extra_body": { - "use_chat_template": True - }, } + # vLLM/SGLang OpenAI servers: apply chat template and start assistant turn + if ( + self.provider == "openai" + and isinstance(self.base_url, str) + and self.base_url + and ("api.openai.com" not in self.base_url) + ): + kwargs["extra_body"] = {"use_chat_template": True, "add_generation_prompt": True} + if "o1" in self.model: logger.warning("O1 models do not 
support temperature, top_p, stop sequence. Disabling.") else: @@ -384,15 +405,15 @@ def _patched___call_api(self, prompt, return_logits, max_new_tokens, num_samples try: response = litellm.completion(**kwargs) msg = response.choices[0].message - content = msg.content + content = getattr(msg, "content", None) reasoning = getattr(msg, "reasoning_content", None) # Accept reasoning-only replies if (not content) and reasoning: return response - if not content: - logger.info("Response is empty, retrying without caching") + if not content and LITELLM_CACHE: + logger.info("Empty content with caching on; retrying uncached once") kwargs["caching"] = False response = litellm.completion(**kwargs) @@ -409,8 +430,49 @@ def _patched___call_api(self, prompt, return_logits, max_new_tokens, num_samples logger.error(f"API call failed after {self.API_MAX_RETRY} attempts.") return LitellmModelResponse() -# APPLY PATCH -LiteLLMClient._LiteLLMClient__call_api = _patched___call_api + +def _patched___call_api_parallel(self, prompts, return_logits, max_new_tokens, num_samples, stop_sequence): # noqa: N802 + # Build per-item args + return_logitss = [return_logits for _ in prompts] if not isinstance(return_logits, list) else return_logits + max_new_tokenss = [max_new_tokens for _ in prompts] if not isinstance(max_new_tokens, list) else max_new_tokens + num_sampless = [num_samples for _ in prompts] if not isinstance(num_samples, list) else num_samples + stop_sequencess = [stop_sequence for _ in prompts] + + n = len(prompts) + assert n == len(return_logitss) == len(max_new_tokenss) == len(num_sampless) == len(stop_sequencess), ( + f"Length mismatch: {len(prompts)}, {len(return_logitss)}, {len(max_new_tokenss)}, " + f"{len(num_sampless)}, {len(stop_sequencess)}" + ) + + results = [None] * n + with ThreadPoolExecutor(self.concurrent_requests) as executor: + futures = [] + for idx in range(n): + fut = executor.submit( + self._LiteLLMClient__call_api, + prompts[idx], + return_logitss[idx], + max_new_tokenss[idx], + num_sampless[idx], + stop_sequencess[idx], + ) + fut._le_idx = idx # attach index for order restoration + futures.append(fut) + + for fut in tqdm(as_completed(futures), total=n, disable=self.disable_tqdm): + idx = getattr(fut, "_le_idx", None) + try: + res = fut.result() + except Exception: + res = None + if idx is not None: + results[idx] = res + + if any(r is None for r in results): + raise ValueError("Some entries are not annotated due to errors in __call_api_parallel, please inspect and retry.") + + return results + def _greedy_until_impl(self, docs: list[Doc]) -> list[ModelResponse]: dataset = GenerativeTaskDataset(requests=docs, num_dataset_splits=self.DATASET_SPLITS) @@ -423,7 +485,6 @@ def _greedy_until_impl(self, docs: list[Doc]) -> list[ModelResponse]: position=0, disable=self.disable_tqdm, ): - # FIX: only build contexts for the current split contexts = [self.prompt_manager.prepare_prompt_api(doc) for doc in split] max_new_tokens = split[0].generation_size @@ -443,22 +504,18 @@ def _greedy_until_impl(self, docs: list[Doc]) -> list[ModelResponse]: ) for response, context in zip(responses, contexts): - raw_contents = [(choice.message.content or "").strip() for choice in response.choices] - raw_reasonings = [(getattr(choice.message, "reasoning_content", None) or "").strip() for choice in response.choices] - merged_texts: list[str] = [] reasonings: list[str | None] = [] - for c, r in zip(raw_contents, raw_reasonings): - if c and r: - merged_texts.append(f"{r}\n\n{c}") - elif c: - merged_texts.append(c) - 
elif r: - merged_texts.append(f"{r}") - else: - merged_texts.append("") - reasonings.append(r if r != "" else None) + for choice in response.choices: + msg = choice.message + raw_content = getattr(msg, "content", None) or "" + reasoning = getattr(msg, "reasoning_content", None) + + # For answer extraction, use only the content field + # The reasoning is stored separately for logging/debugging + merged_texts.append(raw_content.strip() if raw_content else "") + reasonings.append(reasoning if reasoning else None) if not merged_texts or merged_texts[0] is None: merged_texts = [""] @@ -476,10 +533,10 @@ def _greedy_until_impl(self, docs: list[Doc]) -> list[ModelResponse]: return dataset.get_original_order(results) -# Disable lighteval on-disk caching to avoid filesystem issues with task names -# like "gsm8k|5" becoming part of cache paths on certain filesystems. -# We directly bind the greedy method without the caching decorator. -LiteLLMClient.greedy_until = _greedy_until_impl +# Bind patches +LiteLLMClient._LiteLLMClient__call_api = _patched___call_api +LiteLLMClient._LiteLLMClient__call_api_parallel = _patched___call_api_parallel +#LiteLLMClient.greedy_until = _greedy_until_impl PY export PYTHONPATH="${patch_dir}:${PYTHONPATH:-}" } @@ -522,11 +579,11 @@ run_lighteval_eval() { local base_url="http://0.0.0.0:${port}/v1" export OPENAI_API_KEY="${OPENAI_API_KEY:-EMPTY}" - local MODEL_ARGS="model_name=${lite_model},base_url=${base_url},api_key=${OPENAI_API_KEY},generation_parameters={temperature:0.0,max_new_tokens:2056}" + + local MODEL_ARGS="model_name=${lite_model},base_url=${base_url},api_key=${OPENAI_API_KEY},generation_parameters={temperature:0.0,max_new_tokens:2048},concurrent_requests=8" local TASK_SPEC="${task}|${num_fewshot}" set -x - echo "Before lighteval: $(id -u) $(id -g) $(id -un)" >&2 lighteval endpoint litellm \ "${MODEL_ARGS}" \ "${TASK_SPEC}" \ diff --git a/benchmarks/dsr1_fp4_b200_docker.sh b/benchmarks/dsr1_fp4_b200_docker.sh index 656085fef..a2d0fb081 100644 --- a/benchmarks/dsr1_fp4_b200_docker.sh +++ b/benchmarks/dsr1_fp4_b200_docker.sh @@ -49,5 +49,7 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_lm_eval --port "$PORT" +#run_lm_eval --port "$PORT" +MODEL_NAME="openai/$MODEL" +run_eval --framework lighteval --task gsm8k --num-fewshot 5 append_lm_eval_summary \ No newline at end of file diff --git a/benchmarks/dsr1_fp4_mi355x_docker.sh b/benchmarks/dsr1_fp4_mi355x_docker.sh index f19b6df2e..b3f2a0e96 100644 --- a/benchmarks/dsr1_fp4_mi355x_docker.sh +++ b/benchmarks/dsr1_fp4_mi355x_docker.sh @@ -51,4 +51,7 @@ run_benchmark_serving \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ - +MODEL_NAME="openai/$MODEL" +run_eval --framework lighteval --task gsm8k --num-fewshot 5 +append_lm_eval_summary +set +x diff --git a/benchmarks/dsr1_fp8_b200_docker.sh b/benchmarks/dsr1_fp8_b200_docker.sh index e68397661..afb5c1655 100644 --- a/benchmarks/dsr1_fp8_b200_docker.sh +++ b/benchmarks/dsr1_fp8_b200_docker.sh @@ -60,5 +60,8 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_lm_eval --port "$PORT" -append_lm_eval_summary \ No newline at end of file +#run_lm_eval --port "$PORT" +MODEL_NAME="openai/$MODEL" +run_eval --framework lighteval --task gsm8k --num-fewshot 5 +append_lm_eval_summary +set +x \ No newline at end of file diff --git a/benchmarks/dsr1_fp8_h200_slurm.sh b/benchmarks/dsr1_fp8_h200_slurm.sh index 8e8ec7469..4c75cc17e 100644 --- 
a/benchmarks/dsr1_fp8_h200_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_slurm.sh @@ -65,5 +65,7 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_lm_eval --port "$PORT" +#run_lm_eval --port "$PORT" +MODEL_NAME="openai/$MODEL" +run_eval --framework lighteval --task gsm8k --num-fewshot 5 append_lm_eval_summary \ No newline at end of file diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index c829e66b5..797c2b67c 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -91,5 +91,8 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_lm_eval --port "$PORT" +#run_lm_eval --port "$PORT" +MODEL_NAME="openai/$MODEL" +run_eval --framework lighteval --task gsm8k --num-fewshot 5 append_lm_eval_summary +set +x diff --git a/benchmarks/dsr1_fp8_mi300x_docker.sh b/benchmarks/dsr1_fp8_mi300x_docker.sh index 3e604f3ca..cec263322 100644 --- a/benchmarks/dsr1_fp8_mi300x_docker.sh +++ b/benchmarks/dsr1_fp8_mi300x_docker.sh @@ -58,5 +58,8 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_lm_eval --port "$PORT" -append_lm_eval_summary \ No newline at end of file +#run_lm_eval --port "$PORT" +MODEL_NAME="openai/$MODEL" +run_eval --framework lighteval --task gsm8k --num-fewshot 5 +append_lm_eval_summary +set +x \ No newline at end of file diff --git a/benchmarks/dsr1_fp8_mi300x_slurm.sh b/benchmarks/dsr1_fp8_mi300x_slurm.sh index 5fad7a587..f302c942f 100644 --- a/benchmarks/dsr1_fp8_mi300x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi300x_slurm.sh @@ -66,3 +66,8 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +MODEL_NAME="openai/$MODEL" +run_eval --framework lighteval --task gsm8k --num-fewshot 5 +append_lm_eval_summary +set +x \ No newline at end of file diff --git a/benchmarks/dsr1_fp8_mi325x_docker.sh b/benchmarks/dsr1_fp8_mi325x_docker.sh index 565b8fb45..ffe79da5c 100644 --- a/benchmarks/dsr1_fp8_mi325x_docker.sh +++ b/benchmarks/dsr1_fp8_mi325x_docker.sh @@ -48,3 +48,7 @@ run_benchmark_serving \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ +MODEL_NAME="openai/$MODEL" +run_eval --framework lighteval --task gsm8k --num-fewshot 5 +append_lm_eval_summary +set +x \ No newline at end of file diff --git a/benchmarks/dsr1_fp8_mi325x_slurm.sh b/benchmarks/dsr1_fp8_mi325x_slurm.sh index 4e66a64fb..4cfd8fdad 100644 --- a/benchmarks/dsr1_fp8_mi325x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi325x_slurm.sh @@ -44,5 +44,8 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_lm_eval --port "$PORT" -append_lm_eval_summary \ No newline at end of file +#run_lm_eval --port "$PORT" +MODEL_NAME="openai/$MODEL" +run_eval --framework lighteval --task gsm8k --num-fewshot 5 +append_lm_eval_summary +set +x \ No newline at end of file diff --git a/benchmarks/dsr1_fp8_mi355x_docker.sh b/benchmarks/dsr1_fp8_mi355x_docker.sh index 17e51344a..a5a6eee2a 100644 --- a/benchmarks/dsr1_fp8_mi355x_docker.sh +++ b/benchmarks/dsr1_fp8_mi355x_docker.sh @@ -59,6 +59,8 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -run_lm_eval --port "$PORT" +#run_lm_eval --port "$PORT" +MODEL_NAME="openai/$MODEL" +run_eval --framework lighteval --task gsm8k --num-fewshot 5 append_lm_eval_summary set +x \ No newline 
at end of file
diff --git a/benchmarks/dsr1_fp8_mi355x_slurm.sh b/benchmarks/dsr1_fp8_mi355x_slurm.sh
index b16c8e247..fe64e8cdd 100644
--- a/benchmarks/dsr1_fp8_mi355x_slurm.sh
+++ b/benchmarks/dsr1_fp8_mi355x_slurm.sh
@@ -53,6 +53,8 @@ run_benchmark_serving \
   --result-dir /workspace/

 # After throughput, run evaluation (defaults to GSM8K)
-run_lm_eval --port "$PORT"
+#run_lm_eval --port "$PORT"
+MODEL_NAME="openai/$MODEL"
+run_eval --framework lighteval --task gsm8k --num-fewshot 5
 append_lm_eval_summary
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh
index 9906b2fa5..e5a3a6961 100644
--- a/benchmarks/gptoss_fp4_h200_slurm.sh
+++ b/benchmarks/gptoss_fp4_h200_slurm.sh
@@ -75,7 +75,7 @@ run_benchmark_serving \
   --result-dir /workspace/

 # After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT"
+#run_eval --framework lm-eval --port "$PORT"
 run_eval --framework lighteval --task gsm8k --num-fewshot 5
 append_lm_eval_summary
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_h200_trt_slurm.sh b/benchmarks/gptoss_fp4_h200_trt_slurm.sh
index 3c959a7b1..620ddfb42 100644
--- a/benchmarks/gptoss_fp4_h200_trt_slurm.sh
+++ b/benchmarks/gptoss_fp4_h200_trt_slurm.sh
@@ -80,5 +80,5 @@ run_benchmark_serving \
   --result-dir /workspace/

 # After throughput, run evaluation (defaults to GSM8K)
-run_lm_eval --port "$PORT"
+#run_lm_eval --port "$PORT"
 append_lm_eval_summary
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_mi300x_slurm.sh b/benchmarks/gptoss_fp4_mi300x_slurm.sh
index b0ba7db04..8dbeefcf2 100644
--- a/benchmarks/gptoss_fp4_mi300x_slurm.sh
+++ b/benchmarks/gptoss_fp4_mi300x_slurm.sh
@@ -71,6 +71,6 @@ run_benchmark_serving \
   --result-dir /workspace/

 # After throughput, run evaluation (defaults to GSM8K)
-run_lm_eval --port "$PORT"
+#run_lm_eval --port "$PORT"
 append_lm_eval_summary
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_mi325x_docker.sh b/benchmarks/gptoss_fp4_mi325x_docker.sh
index ccfe6e1c3..db330a1b8 100644
--- a/benchmarks/gptoss_fp4_mi325x_docker.sh
+++ b/benchmarks/gptoss_fp4_mi325x_docker.sh
@@ -58,6 +58,6 @@ run_benchmark_serving \
   --result-dir /workspace/

 # After throughput, run evaluation (defaults to GSM8K)
-run_lm_eval --port "$PORT"
+#run_lm_eval --port "$PORT"
 append_lm_eval_summary
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_mi355x_slurm.sh b/benchmarks/gptoss_fp4_mi355x_slurm.sh
index 0dd860bb1..a999303b1 100644
--- a/benchmarks/gptoss_fp4_mi355x_slurm.sh
+++ b/benchmarks/gptoss_fp4_mi355x_slurm.sh
@@ -61,6 +61,6 @@ run_benchmark_serving \
   --result-dir /workspace/

 # After throughput, run evaluation (defaults to GSM8K)
-run_lm_eval --port "$PORT"
+#run_lm_eval --port "$PORT"
 append_lm_eval_summary
 set +x
\ No newline at end of file

From a2d77ffdf28b07ff3c224b7177c52d9da200d693 Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Wed, 26 Nov 2025 22:02:28 +0800
Subject: [PATCH 135/214] Preliminary lighteval for all 2 - fixed TP

---
 .github/workflows/eval-gms8k.yml | 2 +-
 .github/workflows/eval-tmpl.yml | 7 ++-----
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml
index d1b33a6c1..13556dc34 100644
--- a/.github/workflows/eval-gms8k.yml
+++ b/.github/workflows/eval-gms8k.yml
@@ -55,7 +55,7 @@
       framework: sglang
       precision: fp8
       exp-name: ${{ inputs.exp-name || 'dsr1_gsm8k_poc' }}
-      tp: '2'
+      tp: '8'
       ep: '1'
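#      (editorial aside, an assumption not stated in the commit message:
#      "fixed TP" raises tensor parallelism from the leftover gpt-oss value
#      of 2 to 8 so that DeepSeek-R1-0528 shards across all eight
#      accelerators on the mi325x runner)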
      dp-attn: false
       port: ${{ inputs.port || '8888' }}
diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml
index 4fb1f159c..b77d57c5b 100644
--- a/.github/workflows/eval-tmpl.yml
+++ b/.github/workflows/eval-tmpl.yml
@@ -158,12 +158,9 @@ jobs:
     - name: Resource cleanup
       run: |
-        ls -lt eval_out
-        ls -lt .litellm_cache
         pkill -f litellm || true
+        pkill -f lighteval || true
         sleep 2
-        if command -v fuser >/dev/null 2>&1; then
-          fuser -k /home/nvadmin/actions-runner/_work/InferenceMAX/InferenceMAX/.litellm_cache/cache.db 2>/dev/null || true
-        fi
+
         sudo rm -rf .litellm_cache
         sudo rm -rf eval_out*
\ No newline at end of file

From 4e139a03643998c398157a482f0009262756cca3 Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Wed, 26 Nov 2025 22:38:38 +0800
Subject: [PATCH 136/214] Preliminary lighteval for all 3

---
 .github/workflows/eval-gms8k.yml | 2 +-
 .github/workflows/eval-tmpl.yml | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml
index 13556dc34..239a91f2b 100644
--- a/.github/workflows/eval-gms8k.yml
+++ b/.github/workflows/eval-gms8k.yml
@@ -49,7 +49,7 @@ jobs:
     uses: ./.github/workflows/eval-tmpl.yml
     secrets: inherit
     with:
-      runner: mi325x-tw_1
+      runner: mi325x-tw_0
       image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' }}
       model: ${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }}
       framework: sglang
diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml
index b77d57c5b..6e0d3bbb7 100644
--- a/.github/workflows/eval-tmpl.yml
+++ b/.github/workflows/eval-tmpl.yml
@@ -64,11 +64,11 @@ env:
   NUM_FEWSHOT: ${{ inputs['num-fewshot'] }}
   LIMIT: ${{ inputs.limit }}
   EVAL_RESULT_DIR: eval_out
-  CONC: '4'
-  MAX_MODEL_LEN: '8192'
+  CONC: '32'
+  MAX_MODEL_LEN: '4096'
   ISL: 1024
   OSL: 1024
-  RANDOM_RANGE_RATIO: '1.0'
+  RANDOM_RANGE_RATIO: '0.8'
   RESULT_FILENAME: results

From 76b8c2cf4a7743350ca4ce816a56ae16d0695d0a Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Thu, 27 Nov 2025 13:37:03 +0800
Subject: [PATCH 137/214] Fix lighteval 1

---
 benchmarks/benchmark_lib.sh | 4 +++-
 benchmarks/dsr1_fp8_mi325x_slurm.sh | 2 +-
 utils/evals/custom_gsm8k.py | 20 ++++++++++++++++++++
 3 files changed, 24 insertions(+), 2 deletions(-)
 create mode 100644 utils/evals/custom_gsm8k.py

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index a7ab5c5c9..4afb109d8 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -548,6 +548,7 @@ run_lighteval_eval() {
     local num_fewshot="${NUM_FEWSHOT:-5}"
     local results_dir="${EVAL_RESULT_DIR:-eval_out_lighteval}"
     local max_samples=0
+    local concurrent_requests=8

     while [[ $# -gt 0 ]]; do
         case $1 in
@@ -556,6 +557,7 @@ run_lighteval_eval() {
             --num-fewshot) num_fewshot="$2"; shift 2 ;;
             --results-dir) results_dir="$2"; shift 2 ;;
             --max-samples) max_samples="$2"; shift 2 ;;
+            --concurrent-requests) concurrent_requests="$2"; shift 2 ;;
             *) echo "Unknown parameter: $1"; return 1 ;;
         esac
     done
@@ -580,7 +582,7 @@ run_lighteval_eval() {

     export OPENAI_API_KEY="${OPENAI_API_KEY:-EMPTY}"

-    local MODEL_ARGS="model_name=${lite_model},base_url=${base_url},api_key=${OPENAI_API_KEY},generation_parameters={temperature:0.0,max_new_tokens:2048},concurrent_requests=8"
+    local MODEL_ARGS="model_name=${lite_model},base_url=${base_url},api_key=${OPENAI_API_KEY},generation_parameters={temperature:0.0,max_new_tokens:2048},concurrent_requests=${concurrent_requests}"
     local TASK_SPEC="${task}|${num_fewshot}"
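#    (editorial sketch, not part of the patch: with the helper above and the
#    assumed values MODEL_NAME=openai/deepseek-ai/DeepSeek-R1-0528, PORT=8888,
#    and --concurrent-requests 32, the patched function ends up invoking
#    roughly:
#
#      lighteval endpoint litellm \
#        "model_name=openai/deepseek-ai/DeepSeek-R1-0528,base_url=http://0.0.0.0:8888/v1,api_key=EMPTY,generation_parameters={temperature:0.0,max_new_tokens:2048},concurrent_requests=32" \
#        "gsm8k|5" \
#        --output-dir /workspace/eval_out --max-samples 0
#    )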
set -x diff --git a/benchmarks/dsr1_fp8_mi325x_slurm.sh b/benchmarks/dsr1_fp8_mi325x_slurm.sh index 4cfd8fdad..b63495024 100644 --- a/benchmarks/dsr1_fp8_mi325x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi325x_slurm.sh @@ -46,6 +46,6 @@ run_benchmark_serving \ # After throughput, run evaluation (defaults to GSM8K) #run_lm_eval --port "$PORT" MODEL_NAME="openai/$MODEL" -run_eval --framework lighteval --task gsm8k --num-fewshot 5 +run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC append_lm_eval_summary set +x \ No newline at end of file diff --git a/utils/evals/custom_gsm8k.py b/utils/evals/custom_gsm8k.py new file mode 100644 index 000000000..4449188fa --- /dev/null +++ b/utils/evals/custom_gsm8k.py @@ -0,0 +1,20 @@ +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.tasks.gsm8k import gsm8k_prompt + +gsm8k_long = LightevalTaskConfig( + name="gsm8k_long", + prompt_function=gsm8k_prompt, + hf_repo="openai/gsm8k", + hf_subset="main", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling_from_train", + generation_size=768, # raise this as needed + metrics=[Metrics.expr_gold_metric], + stop_sequence=None, # avoid early stop on "Question:" + version=0, +) + +TASKS_TABLE = [gsm8k_long] From fda8e2c30902e03b9e40345a6fb3e21c91125d64 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 27 Nov 2025 14:04:48 +0800 Subject: [PATCH 138/214] Check both --- .github/workflows/eval-gms8k.yml | 2 +- benchmarks/benchmark_lib.sh | 7 +++++-- benchmarks/dsr1_fp8_mi325x_slurm.sh | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 239a91f2b..3e71d6885 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -61,4 +61,4 @@ jobs: port: ${{ inputs.port || '8888' }} eval-task: gsm8k num-fewshot: ${{ inputs.num_fewshot || '5' }} - limit: ${{ inputs.limit || '200' }} + limit: ${{ inputs.limit || '200' }} diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 4afb109d8..cf05c6984 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -246,6 +246,7 @@ run_lm_eval() { local gen_max_tokens=4096 local temperature=0 local top_p=1 + local concurrent_requests=32 while [[ $# -gt 0 ]]; do case $1 in @@ -257,6 +258,7 @@ run_lm_eval() { --gen-max-tokens) gen_max_tokens="$2"; shift 2 ;; --temperature) temperature="$2"; shift 2 ;; --top-p) top_p="$2"; shift 2 ;; + --concurrent-requests) concurrent_requests="$2"; shift 2 ;; *) echo "Unknown parameter: $1"; return 1 ;; esac done @@ -274,7 +276,7 @@ run_lm_eval() { --num_fewshot "${num_fewshot}" \ --batch_size "${batch_size}" \ --output_path "/workspace/${results_dir}" \ - --model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=3,num_concurrent=32,tokenized_requests=False" \ + --model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=3,num_concurrent=${concurrent_requests},tokenized_requests=False" \ --gen_kwargs "max_tokens=${gen_max_tokens},temperature=${temperature},top_p=${top_p}" set +x } @@ -548,7 +550,7 @@ run_lighteval_eval() { local num_fewshot="${NUM_FEWSHOT:-5}" local results_dir="${EVAL_RESULT_DIR:-eval_out_lighteval}" local max_samples=0 - local concurrent_requests=8 + local concurrent_requests=32 while [[ $# -gt 
0 ]]; do case $1 in @@ -590,6 +592,7 @@ run_lighteval_eval() { "${MODEL_ARGS}" \ "${TASK_SPEC}" \ --output-dir "/workspace/${results_dir}" \ + --custom-tasks utils/evals/custom_gsm8k.py \ --max-samples "${max_samples}" set +x } diff --git a/benchmarks/dsr1_fp8_mi325x_slurm.sh b/benchmarks/dsr1_fp8_mi325x_slurm.sh index b63495024..641e2cde9 100644 --- a/benchmarks/dsr1_fp8_mi325x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi325x_slurm.sh @@ -44,7 +44,7 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -#run_lm_eval --port "$PORT" +run_lm_eval --port "$PORT" --concurrent-requests $CONC MODEL_NAME="openai/$MODEL" run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC append_lm_eval_summary From 2e7c12783dfa0647e1655424c1e9e4f8681f5cff Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 27 Nov 2025 14:55:02 +0800 Subject: [PATCH 139/214] lm-eval check --- benchmarks/dsr1_fp8_mi325x_slurm.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmarks/dsr1_fp8_mi325x_slurm.sh b/benchmarks/dsr1_fp8_mi325x_slurm.sh index 641e2cde9..a19302f15 100644 --- a/benchmarks/dsr1_fp8_mi325x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi325x_slurm.sh @@ -44,8 +44,9 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) +MODEL_NAME="$MODEL" run_lm_eval --port "$PORT" --concurrent-requests $CONC -MODEL_NAME="openai/$MODEL" -run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC +#MODEL_NAME="openai/$MODEL" +#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC append_lm_eval_summary set +x \ No newline at end of file From 867bfc3cdb1043e89db13e53e5428caaa042ade5 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 27 Nov 2025 14:55:41 +0800 Subject: [PATCH 140/214] lm-eval check --- .github/workflows/eval-gms8k.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 3e71d6885..4a12776af 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -49,7 +49,7 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi325x-tw_0 + runner: mi325x-tw_0 image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' }} model: ${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }} framework: sglang From 8cbe81f4d8e33842fa4e1c88847837908564c0ea Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 27 Nov 2025 16:41:05 +0800 Subject: [PATCH 141/214] lm-eval check --- .github/workflows/eval-gms8k.yml | 14 +++++++------- benchmarks/benchmark_lib.sh | 2 -- benchmarks/gptoss_fp4_h100_slurm.sh | 4 ++-- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 4a12776af..eea18705c 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -49,13 +49,13 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi325x-tw_0 - image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' }} - model: ${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }} - framework: sglang - precision: fp8 - exp-name: ${{ inputs.exp-name || 'dsr1_gsm8k_poc' }} - tp: '8' + runner: h100-cw_1 + image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} + model: ${{ inputs.model || 
'openai/gpt-oss-120b' }}
+      framework: vllm
+      precision: fp4
+      exp-name: ${{ inputs.exp-name || 'gptoss_gsm8k_poc' }}
+      tp: '4'
       ep: '1'
       dp-attn: false
       port: ${{ inputs.port || '8888' }}
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index cf05c6984..d76b6fbd1 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -619,6 +619,4 @@ run_eval() {
         lighteval) run_lighteval_eval "${forwarded[@]}" ;;
         *) echo "Unknown framework '${framework}'"; return 1 ;;
     esac
-
-    ls -ld /workspace /workspace/eval_out* /workspace/results*
 }
diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh
index 19ad98294..ee2f37c4a 100644
--- a/benchmarks/gptoss_fp4_h100_slurm.sh
+++ b/benchmarks/gptoss_fp4_h100_slurm.sh
@@ -62,7 +62,7 @@ run_benchmark_serving \
   --result-dir /workspace/

 # After throughput, run evaluation (defaults to GSM8K)
-#run_eval --framework lm-eval --port "$PORT"
-run_eval --framework lighteval --task gsm8k --num-fewshot 5
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( (CONC * 3 + 1)/2 ))
+#run_eval --framework lighteval --task gsm8k --num-fewshot 5
 append_lm_eval_summary
 set +x

From 1b3b79f75fbd7500938821fdafda118655515bff Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Thu, 27 Nov 2025 17:03:52 +0800
Subject: [PATCH 142/214] lm-eval optimization

---
 benchmarks/benchmark_lib.sh | 23 +++++++++++++++++++++
 benchmarks/gptoss_fp4_h100_slurm.sh | 4 ++--
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index d76b6fbd1..f11fc0289 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -232,6 +232,29 @@ def _safe_mc_apply(self, resps, docs):
     return out

 ex.MultiChoiceRegexFilter.apply = _safe_mc_apply
+
+def _le_parse_generations(outputs, **kwargs):
+    res = []
+    if not isinstance(outputs, list):
+        outputs = [outputs]
+    for out in (outputs or []):
+        try:
+            choices = out.get("choices", [])
+            tmp = ["" for _ in choices]
+            for choice in choices:
+                idx = choice.get("index", 0)
+                msg = (choice.get("message") or {})
+                content = msg.get("content")
+                if content in (None, "", []):
+                    content = msg.get("reasoning_content") or ""
+                tmp[idx] = content
+        except Exception:
+            tmp = [""]
+        res.extend(tmp)
+    return res
+
+# Keep staticmethod semantics
+_LCC.parse_generations = staticmethod(_le_parse_generations)
 PY
 export PYTHONPATH="${patch_dir}:${PYTHONPATH:-}"
 }
diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh
index ee2f37c4a..3c84888b9 100644
--- a/benchmarks/gptoss_fp4_h100_slurm.sh
+++ b/benchmarks/gptoss_fp4_h100_slurm.sh
@@ -56,13 +56,13 @@ run_benchmark_serving \
   --input-len "$ISL" \
   --output-len "$OSL" \
   --random-range-ratio "$RANDOM_RANGE_RATIO" \
-  --num-prompts $(( $CONC * 10 )) \
+  --num-prompts $(( $CONC * 1 )) \
   --max-concurrency "$CONC" \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/

 # After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( (CONC * 3 + 1)/2 ))
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 )) #$(( (CONC * 3 + 1)/2 ))
 #run_eval --framework lighteval --task gsm8k --num-fewshot 5
 append_lm_eval_summary
 set +x

From 65f03037b658ff6603d985384173b2e8b2bd0b4b Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Fri, 28 Nov 2025 00:39:18 +0800
Subject: [PATCH 143/214] mi325x test

---
 .github/workflows/eval-gms8k.yml | 4 +--
 benchmarks/benchmark_lib.sh | 10
++---- benchmarks/gptoss_fp4_mi325x_slurm.sh | 5 +-- utils/evals/gsm8k.yaml | 45 +++++++++++++++++++++++++++ 4 files changed, 53 insertions(+), 11 deletions(-) create mode 100644 utils/evals/gsm8k.yaml diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index eea18705c..79173b639 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -49,8 +49,8 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: h100-cw_1 - image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} + runner: mi325x-tw_1 + image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} model: ${{ inputs.model || 'openai/gpt-oss-120b' }} framework: vllm precision: fp4 diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index f11fc0289..29698e52e 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -265,7 +265,6 @@ run_lm_eval() { local task="${EVAL_TASK:-gsm8k}" local num_fewshot="${NUM_FEWSHOT:-5}" local results_dir="${EVAL_RESULT_DIR:-eval_out}" - local batch_size=3 local gen_max_tokens=4096 local temperature=0 local top_p=1 @@ -277,7 +276,6 @@ run_lm_eval() { --task) task="$2"; shift 2 ;; --num-fewshot) num_fewshot="$2"; shift 2 ;; --results-dir) results_dir="$2"; shift 2 ;; - --batch-size) batch_size="$2"; shift 2 ;; --gen-max-tokens) gen_max_tokens="$2"; shift 2 ;; --temperature) temperature="$2"; shift 2 ;; --top-p) top_p="$2"; shift 2 ;; @@ -295,11 +293,10 @@ run_lm_eval() { set -x python3 -m lm_eval --model local-chat-completions --apply_chat_template \ - --tasks "${task}" \ + --tasks "utils/evals/${task}.yaml" \ --num_fewshot "${num_fewshot}" \ - --batch_size "${batch_size}" \ --output_path "/workspace/${results_dir}" \ - --model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=3,num_concurrent=${concurrent_requests},tokenized_requests=False" \ + --model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=2,num_concurrent=${concurrent_requests},tokenized_requests=False" \ --gen_kwargs "max_tokens=${gen_max_tokens},temperature=${temperature},top_p=${top_p}" set +x } @@ -606,8 +603,7 @@ run_lighteval_eval() { local base_url="http://0.0.0.0:${port}/v1" export OPENAI_API_KEY="${OPENAI_API_KEY:-EMPTY}" - - local MODEL_ARGS="model_name=${lite_model},base_url=${base_url},api_key=${OPENAI_API_KEY},generation_parameters={temperature:0.0,max_new_tokens:2048},concurrent_requests=${concurrent_requests}" + local MODEL_ARGS="model_name=${lite_model},base_url=${base_url},api_key=${OPENAI_API_KEY},generation_parameters={temperature:0.0,top_p=1,max_new_tokens:2048},concurrent_requests=${concurrent_requests}" local TASK_SPEC="${task}|${num_fewshot}" set -x diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh index b3bdfbec7..bac78918d 100644 --- a/benchmarks/gptoss_fp4_mi325x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh @@ -70,7 +70,8 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -#run_eval --framework lm-eval --port "$PORT" -run_eval --framework lighteval --task gsm8k --num-fewshot 5 +MODEL_NAME="$MODEL" +run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 )) +#run_eval --framework lighteval --task gsm8k --num-fewshot 5 append_lm_eval_summary set +x \ No newline at end of file diff --git a/utils/evals/gsm8k.yaml 
b/utils/evals/gsm8k.yaml new file mode 100644 index 000000000..3d9e5ce3b --- /dev/null +++ b/utils/evals/gsm8k.yaml @@ -0,0 +1,45 @@ +tag: + - math_word_problems +task: gsm8k +dataset_path: gsm8k +dataset_name: main +output_type: generate_until +training_split: train +fewshot_split: train +test_split: test +doc_to_text: "Question: {{question}}\nEnd your answer with: #### \nAnswer:" +doc_to_target: "{{answer}}" #" {{answer.split('### ')[-1].rstrip()}}" +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: false + regexes_to_ignore: + - "," + - "\\$" + - "(?s).*#### " + - "\\.$" +generation_kwargs: + until: + - "Question:" + - "" + - "<|im_end|>" + do_sample: false + temperature: 0.0 +repeats: 1 +num_fewshot: 5 +filter_list: + - name: "strict-match" + filter: + - function: "regex" + regex_pattern: "#### (\\-?[0-9\\.\\,]+)" + - function: "take_first" + - name: "flexible-extract" + filter: + - function: "regex" + group_select: -1 + regex_pattern: "(-?[$0-9.,]{2,})|(-?[0-9]+)" + - function: "take_first" +metadata: + version: 3.0 From ddd3862ff73f0b4970b7f06ecfccb8a1ccae4a5b Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 28 Nov 2025 00:51:36 +0800 Subject: [PATCH 144/214] mi325x test --- .github/workflows/eval-gms8k.yml | 2 +- benchmarks/gptoss_fp4_mi325x_slurm.sh | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index 79173b639..c4103fd0f 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -55,7 +55,7 @@ jobs: framework: vllm precision: fp4 exp-name: ${{ inputs.exp-name || 'gptoss_gsm8k_poc' }} - tp: '4' + tp: '2' ep: '1' dp-attn: false port: ${{ inputs.port || '8888' }} diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh index bac78918d..40ed241f7 100644 --- a/benchmarks/gptoss_fp4_mi325x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh @@ -70,7 +70,6 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -MODEL_NAME="$MODEL" run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 )) #run_eval --framework lighteval --task gsm8k --num-fewshot 5 append_lm_eval_summary From 30ad3ba00effb5f089f49a81153b811c3b1a9330 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 28 Nov 2025 13:35:50 +0800 Subject: [PATCH 145/214] all change, test deepseek --- .github/workflows/eval-gms8k.yml | 14 +-- benchmarks/dsr1_fp4_b200_docker.sh | 9 +- benchmarks/dsr1_fp4_b200_trt_slurm.sh | 11 +- benchmarks/dsr1_fp4_mi355x_docker.sh | 8 +- benchmarks/dsr1_fp4_mi355x_slurm.sh | 143 +----------------------- benchmarks/dsr1_fp8_b200_docker.sh | 6 +- benchmarks/dsr1_fp8_b200_trt_slurm.sh | 11 +- benchmarks/dsr1_fp8_h200_slurm.sh | 9 +- benchmarks/dsr1_fp8_h200_trt_slurm.sh | 16 +-- benchmarks/dsr1_fp8_mi300x_docker.sh | 6 +- benchmarks/dsr1_fp8_mi300x_slurm.sh | 6 +- benchmarks/dsr1_fp8_mi325x_docker.sh | 6 +- benchmarks/dsr1_fp8_mi325x_slurm.sh | 3 +- benchmarks/dsr1_fp8_mi355x_docker.sh | 6 +- benchmarks/dsr1_fp8_mi355x_slurm.sh | 6 +- benchmarks/gptoss_fp4_b200_docker.sh | 2 +- benchmarks/gptoss_fp4_b200_trt_slurm.sh | 12 +- benchmarks/gptoss_fp4_h100_docker.sh | 4 +- benchmarks/gptoss_fp4_h100_slurm.sh | 2 +- benchmarks/gptoss_fp4_h200_slurm.sh | 4 +- benchmarks/gptoss_fp4_h200_trt_slurm.sh | 20 +--- benchmarks/gptoss_fp4_mi300x_docker.sh | 4 +- benchmarks/gptoss_fp4_mi300x_slurm.sh | 2 +- 
benchmarks/gptoss_fp4_mi325x_docker.sh | 2 +- benchmarks/gptoss_fp4_mi325x_slurm.sh | 9 +- benchmarks/gptoss_fp4_mi355x_docker.sh | 4 +- benchmarks/gptoss_fp4_mi355x_slurm.sh | 2 +- 27 files changed, 86 insertions(+), 241 deletions(-) diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml index c4103fd0f..3e71d6885 100644 --- a/.github/workflows/eval-gms8k.yml +++ b/.github/workflows/eval-gms8k.yml @@ -49,13 +49,13 @@ jobs: uses: ./.github/workflows/eval-tmpl.yml secrets: inherit with: - runner: mi325x-tw_1 - image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' }} - model: ${{ inputs.model || 'openai/gpt-oss-120b' }} - framework: vllm - precision: fp4 - exp-name: ${{ inputs.exp-name || 'gptoss_gsm8k_poc' }} - tp: '2' + runner: mi325x-tw_0 + image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' }} + model: ${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }} + framework: sglang + precision: fp8 + exp-name: ${{ inputs.exp-name || 'dsr1_gsm8k_poc' }} + tp: '8' ep: '1' dp-attn: false port: ${{ inputs.port || '8888' }} diff --git a/benchmarks/dsr1_fp4_b200_docker.sh b/benchmarks/dsr1_fp4_b200_docker.sh index a2d0fb081..08469715e 100644 --- a/benchmarks/dsr1_fp4_b200_docker.sh +++ b/benchmarks/dsr1_fp4_b200_docker.sh @@ -49,7 +49,8 @@ run_benchmark_serving \ --result-dir /workspace/ # After throughput, run evaluation (defaults to GSM8K) -#run_lm_eval --port "$PORT" -MODEL_NAME="openai/$MODEL" -run_eval --framework lighteval --task gsm8k --num-fewshot 5 -append_lm_eval_summary \ No newline at end of file +run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 )) +#MODEL_NAME="openai/$MODEL" +#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC +append_lm_eval_summary +set +x \ No newline at end of file diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh index b4227e428..3700c3b40 100644 --- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh @@ -1,16 +1,13 @@ #!/usr/bin/env bash -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE +# === Required Env Vars === # MODEL +# TP +# CONC # ISL # OSL # MAX_MODEL_LEN # RANDOM_RANGE_RATIO -# TP -# CONC # RESULT_FILENAME # PORT_OFFSET # DP_ATTENTION @@ -118,4 +115,4 @@ run_benchmark_serving \ --num-prompts $(( $CONC * 10 )) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ + --result-dir /workspace/ \ No newline at end of file diff --git a/benchmarks/dsr1_fp4_mi355x_docker.sh b/benchmarks/dsr1_fp4_mi355x_docker.sh index b3f2a0e96..2c5bd1e42 100644 --- a/benchmarks/dsr1_fp4_mi355x_docker.sh +++ b/benchmarks/dsr1_fp4_mi355x_docker.sh @@ -51,7 +51,9 @@ run_benchmark_serving \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ -MODEL_NAME="openai/$MODEL" -run_eval --framework lighteval --task gsm8k --num-fewshot 5 +# After throughput, run evaluation (defaults to GSM8K) +run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 )) +#MODEL_NAME="openai/$MODEL" +#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC append_lm_eval_summary -set +x +set +x \ No newline at end of file diff --git a/benchmarks/dsr1_fp4_mi355x_slurm.sh b/benchmarks/dsr1_fp4_mi355x_slurm.sh index b0f1c33c0..a208e8d26 100644 --- a/benchmarks/dsr1_fp4_mi355x_slurm.sh +++ b/benchmarks/dsr1_fp4_mi355x_slurm.sh @@ 
-54,140 +54,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
-
-#######
-
-#
-## Evals setup
-# !TODO clean env vars
-EVAL_RESULT_DIR=${EVAL_RESULT_DIR:-eval_out}
-OPENAI_SERVER_BASE="http://0.0.0.0:${PORT}"
-OPENAI_COMP_BASE="$OPENAI_SERVER_BASE/v1/completions"
-OPENAI_CHAT_BASE="$OPENAI_SERVER_BASE/v1/chat/completions"
-export OPENAI_API_KEY=${OPENAI_API_KEY:-EMPTY}
-
-# Patch to convert bypass regex error if content field is empty
-PATCH_DIR="$(mktemp -d)"
-cat > "$PATCH_DIR/sitecustomize.py" <<'PY'
-import re, sys, unicodedata
-from lm_eval.filters import extraction as ex
-
-def _s(x):  # coerce to str
-    return x if isinstance(x, str) else ""
-
-# --- Patch RegexFilter.apply (used by many datasets) ---
-_orig_regex_apply = ex.RegexFilter.apply
-def _safe_regex_apply(self, resps, docs):
-    out = []
-    for inst in resps:  # inst is a list of candidate responses for one doc
-        filtered = []
-        for resp in inst:
-            txt = _s(resp)
-            m = self.regex.findall(txt)
-            if m:
-                m = m[self.group_select]
-                if isinstance(m, tuple):
-                    m = [t for t in m if t]
-                    m = m[0] if m else self.fallback
-                m = m.strip()
-            else:
-                m = self.fallback
-            filtered.append(m)
-        out.append(filtered)
-    return out
-ex.RegexFilter.apply = _safe_regex_apply
-
-# --- Patch MultiChoiceRegexFilter.apply (used by GSM8K flexible-extract) ---
-_orig_mc_apply = ex.MultiChoiceRegexFilter.apply
-def _safe_mc_apply(self, resps, docs):
-    def find_match(regex, resp, convert_dict={}):
-        txt = _s(resp)
-        match = regex.findall(txt)
-        if match:
-            match = match[self.group_select]
-            if isinstance(match, tuple):
-                match = [m for m in match if m]
-                if match:
-                    match = match[0]
-            if match:
-                match = match.strip()
-                if match in convert_dict:
-                    return convert_dict[match]
-            return match
-        return None
-
-    punct_tbl = dict.fromkeys(
-        i for i in range(sys.maxunicode)
-        if unicodedata.category(chr(i)).startswith("P")
-    )
-
-    def filter_ignores(st):
-        st = _s(st)
-        if self.regexes_to_ignore is not None:
-            for s in self.regexes_to_ignore:
-                st = re.sub(s, "", st)
-        if self.ignore_case:
-            st = st.lower()
-        if self.ignore_punctuation:
-            st = st.translate(punct_tbl)
-        return st
-
-    out = []
-    for r, doc in zip(resps, docs):
-        # Build fallback regexes from choices (A, B, C, ...) as in upstream
-        fallback_regexes, choice_to_alpha = [], {}
-        next_alpha = "A"
-        without_paren, without_paren_to_target = [], {}
-        for c in doc.get("choices", []):
-            m = filter_ignores(c.strip())
-            fallback_regexes.append(re.escape(m))
-            choice_to_alpha[m] = f"({next_alpha})"
-            without_paren.append(next_alpha)
-            without_paren_to_target[next_alpha] = f"({next_alpha})"
-            next_alpha = chr(ord(next_alpha) + 1)
-
-        fallback_regex = re.compile("|".join(fallback_regexes)) if fallback_regexes else None
-        without_paren_regex = re.compile(rf":[\s]*({'|'.join(without_paren)})") if without_paren else None
-
-        filtered = []
-        for resp in r:
-            m = find_match(self.regex, resp)
-            if not m and fallback_regex:
-                m = find_match(fallback_regex, filter_ignores(resp), choice_to_alpha)
-            if not m and without_paren_regex:
-                m = find_match(without_paren_regex, resp, without_paren_to_target)
-            if not m:
-                m = self.fallback
-            filtered.append(m)
-        out.append(filtered)
-    return out
-
-ex.MultiChoiceRegexFilter.apply = _safe_mc_apply
-PY
-
-export PYTHONPATH="${PATCH_DIR}:${PYTHONPATH:-}"
-set -x
-python3 -m lm_eval --model local-chat-completions --apply_chat_template \
---tasks ${EVAL_TASK:-gsm8k} \
---num_fewshot ${NUM_FEWSHOT:-5} \
---batch_size 2 \
---output_path "/workspace/${EVAL_RESULT_DIR}" \
---model_args "model=$MODEL,base_url=$OPENAI_CHAT_BASE,api_key=$OPENAI_API_KEY,eos_string=,max_retries=3,num_concurrent=32,tokenized_requests=False" \
---gen_kwargs "max_tokens=8192,temperature=0,top_p=1"
-set +x
-
-# Append a Markdown table to the GitHub Actions job summary using helper in bench_serving
-if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then
-python3 bench_serving/lm_eval_to_md.py \
-    --results-dir "/workspace/${EVAL_RESULT_DIR}" \
-    --task "${EVAL_TASK:-gsm8k}" \
-    --framework "${FRAMEWORK}" \
-    --precision "${PRECISION}" \
-    --tp "${TP:-1}" \
-    --ep "${EP_SIZE:-1}" \
-    --dp-attention "${DP_ATTENTION:-false}" \
-    >> "$GITHUB_STEP_SUMMARY" || true
-fi
-
-echo "Evaluation completed. Results in /workspace/${EVAL_RESULT_DIR}"
-exit 0
+# After throughput, run evaluation (defaults to GSM8K)
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+#MODEL_NAME="openai/$MODEL"
+#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC
+append_lm_eval_summary
+set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp8_b200_docker.sh b/benchmarks/dsr1_fp8_b200_docker.sh
index afb5c1655..c98e07d08 100644
--- a/benchmarks/dsr1_fp8_b200_docker.sh
+++ b/benchmarks/dsr1_fp8_b200_docker.sh
@@ -60,8 +60,8 @@ run_benchmark_serving \
   --result-dir /workspace/
 
 # After throughput, run evaluation (defaults to GSM8K)
-#run_lm_eval --port "$PORT"
-MODEL_NAME="openai/$MODEL"
-run_eval --framework lighteval --task gsm8k --num-fewshot 5
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+#MODEL_NAME="openai/$MODEL"
+#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC
 append_lm_eval_summary
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh
index a9a1a04ff..7a072ff66 100644
--- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh
+++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh
@@ -1,16 +1,13 @@
 #!/usr/bin/env bash
 
-# === Required Env Vars ===
-# HF_TOKEN
-# HF_HUB_CACHE
-# IMAGE
+# === Required Env Vars ===
 # MODEL
+# TP
+# CONC
 # ISL
 # OSL
 # MAX_MODEL_LEN
 # RANDOM_RANGE_RATIO
-# TP
-# CONC
 # RESULT_FILENAME
 # PORT_OFFSET
 # DP_ATTENTION
@@ -88,4 +85,4 @@ run_benchmark_serving \
   --num-prompts $(( $CONC * 10 )) \
   --max-concurrency "$CONC" \
   --result-filename "$RESULT_FILENAME" \
-  --result-dir /workspace/
+  --result-dir /workspace/
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp8_h200_slurm.sh b/benchmarks/dsr1_fp8_h200_slurm.sh
index 4c75cc17e..9ffd81f8d 100644
--- a/benchmarks/dsr1_fp8_h200_slurm.sh
+++ b/benchmarks/dsr1_fp8_h200_slurm.sh
@@ -65,7 +65,8 @@ run_benchmark_serving \
   --result-dir /workspace/
 
 # After throughput, run evaluation (defaults to GSM8K)
-#run_lm_eval --port "$PORT"
-MODEL_NAME="openai/$MODEL"
-run_eval --framework lighteval --task gsm8k --num-fewshot 5
-append_lm_eval_summary
\ No newline at end of file
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+#MODEL_NAME="openai/$MODEL"
+#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC
+append_lm_eval_summary
+set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh
index 797c2b67c..ac6bc167c 100644
--- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh
+++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh
@@ -1,16 +1,13 @@
 #!/usr/bin/env bash
 
-# === Required Env Vars ===
-# HF_TOKEN
-# HF_HUB_CACHE
-# IMAGE
+# === Required Env Vars ===
 # MODEL
+# TP
+# CONC
 # ISL
 # OSL
 # MAX_MODEL_LEN
 # RANDOM_RANGE_RATIO
-# TP
-# CONC
 # RESULT_FILENAME
 # PORT_OFFSET
 # DP_ATTENTION
@@ -89,10 +86,3 @@ run_benchmark_serving \
   --max-concurrency "$CONC" \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
-
-# After throughput, run evaluation (defaults to GSM8K)
-#run_lm_eval --port "$PORT"
-MODEL_NAME="openai/$MODEL"
-run_eval --framework lighteval --task gsm8k --num-fewshot 5
-append_lm_eval_summary
-set +x
diff --git a/benchmarks/dsr1_fp8_mi300x_docker.sh b/benchmarks/dsr1_fp8_mi300x_docker.sh
index cec263322..9e7d58295 100644
--- a/benchmarks/dsr1_fp8_mi300x_docker.sh
+++ b/benchmarks/dsr1_fp8_mi300x_docker.sh
@@ -58,8 +58,8 @@ run_benchmark_serving \
   --result-dir /workspace/
 
 # After throughput, run evaluation (defaults to GSM8K)
-#run_lm_eval --port "$PORT"
-MODEL_NAME="openai/$MODEL"
-run_eval --framework lighteval --task gsm8k --num-fewshot 5
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+#MODEL_NAME="openai/$MODEL"
+#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC
 append_lm_eval_summary
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp8_mi300x_slurm.sh b/benchmarks/dsr1_fp8_mi300x_slurm.sh
index f302c942f..ff7621425 100644
--- a/benchmarks/dsr1_fp8_mi300x_slurm.sh
+++ b/benchmarks/dsr1_fp8_mi300x_slurm.sh
@@ -67,7 +67,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-MODEL_NAME="openai/$MODEL"
-run_eval --framework lighteval --task gsm8k --num-fewshot 5
+# After throughput, run evaluation (defaults to GSM8K)
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+#MODEL_NAME="openai/$MODEL"
+#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC
 append_lm_eval_summary
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp8_mi325x_docker.sh b/benchmarks/dsr1_fp8_mi325x_docker.sh
index ffe79da5c..4231f2df7 100644
--- a/benchmarks/dsr1_fp8_mi325x_docker.sh
+++ b/benchmarks/dsr1_fp8_mi325x_docker.sh
@@ -48,7 +48,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-MODEL_NAME="openai/$MODEL"
-run_eval --framework lighteval --task gsm8k --num-fewshot 5
+# After throughput, run evaluation (defaults to GSM8K)
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+#MODEL_NAME="openai/$MODEL"
+#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC
 append_lm_eval_summary
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp8_mi325x_slurm.sh b/benchmarks/dsr1_fp8_mi325x_slurm.sh
index a19302f15..aef77d4ac 100644
--- a/benchmarks/dsr1_fp8_mi325x_slurm.sh
+++ b/benchmarks/dsr1_fp8_mi325x_slurm.sh
@@ -44,8 +44,7 @@ run_benchmark_serving \
   --result-dir /workspace/
 
 # After throughput, run evaluation (defaults to GSM8K)
-MODEL_NAME="$MODEL"
-run_lm_eval --port "$PORT" --concurrent-requests $CONC
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
 #MODEL_NAME="openai/$MODEL"
 #run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC
 append_lm_eval_summary
diff --git a/benchmarks/dsr1_fp8_mi355x_docker.sh b/benchmarks/dsr1_fp8_mi355x_docker.sh
index a5a6eee2a..f692a2173 100644
--- a/benchmarks/dsr1_fp8_mi355x_docker.sh
+++ b/benchmarks/dsr1_fp8_mi355x_docker.sh
@@ -59,8 +59,8 @@ run_benchmark_serving \
   --result-dir /workspace/
 
 # After throughput, run evaluation (defaults to GSM8K)
-#run_lm_eval --port "$PORT"
-MODEL_NAME="openai/$MODEL"
-run_eval --framework lighteval --task gsm8k --num-fewshot 5
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+#MODEL_NAME="openai/$MODEL"
+#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC
 append_lm_eval_summary
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp8_mi355x_slurm.sh b/benchmarks/dsr1_fp8_mi355x_slurm.sh
index fe64e8cdd..7916371bf 100644
--- a/benchmarks/dsr1_fp8_mi355x_slurm.sh
+++ b/benchmarks/dsr1_fp8_mi355x_slurm.sh
@@ -53,8 +53,8 @@ run_benchmark_serving \
   --result-dir /workspace/
 
 # After throughput, run evaluation (defaults to GSM8K)
-#run_lm_eval --port "$PORT"
-MODEL_NAME="openai/$MODEL"
-run_eval --framework lighteval --task gsm8k --num-fewshot 5
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+#MODEL_NAME="openai/$MODEL"
+#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC
 append_lm_eval_summary
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh
index f33e23bd6..9d095a8ef 100644
--- a/benchmarks/gptoss_fp4_b200_docker.sh
+++ b/benchmarks/gptoss_fp4_b200_docker.sh
@@ -80,7 +80,7 @@ run_benchmark_serving \
   --result-dir /workspace/
 
 # After throughput, run evaluation (defaults to GSM8K)
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
 #run_eval --framework lm-eval --port "$PORT"
-run_eval --framework lighteval --task gsm8k --num-fewshot 5
 append_lm_eval_summary
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh
index 0ec2f325f..ff9af7854 100644
--- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh
+++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh
@@ -1,16 +1,13 @@
 #!/usr/bin/env bash
 
-# === Required Env Vars ===
-# HF_TOKEN
-# HF_HUB_CACHE
-# IMAGE
+# === Required Env Vars ===
 # MODEL
+# TP
+# CONC
 # ISL
 # OSL
 # MAX_MODEL_LEN
 # RANDOM_RANGE_RATIO
-# TP
-# CONC
 # RESULT_FILENAME
 # PORT_OFFSET
 # DP_ATTENTION
@@ -86,7 +83,6 @@ source "$(dirname "$0")/benchmark_lib.sh"
 # Wait for server to be ready
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
-set -x
 run_benchmark_serving \
   --model "$MODEL" \
   --port "$PORT" \
@@ -97,4 +93,4 @@ run_benchmark_serving \
   --num-prompts $(( $CONC * 10 )) \
   --max-concurrency "$CONC" \
   --result-filename "$RESULT_FILENAME" \
-  --result-dir /workspace/
+  --result-dir /workspace/
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh
index 463f31b90..b5de6a296 100644
--- a/benchmarks/gptoss_fp4_h100_docker.sh
+++ b/benchmarks/gptoss_fp4_h100_docker.sh
@@ -61,7 +61,7 @@ run_benchmark_serving \
   --result-dir /workspace/
 
 # After throughput, run evaluation (defaults to GSM8K)
-#run_eval --framework lm-eval --port "$PORT"
-run_eval --framework lighteval --task gsm8k --num-fewshot 5
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+#run_eval --framework lighteval --task gsm8k --num-fewshot 5
 append_lm_eval_summary
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh
index 3c84888b9..6f9330f81 100644
--- a/benchmarks/gptoss_fp4_h100_slurm.sh
+++ b/benchmarks/gptoss_fp4_h100_slurm.sh
@@ -62,7 +62,7 @@ run_benchmark_serving \
   --result-dir /workspace/
 
 # After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 )) #$(( (CONC * 3 + 1)/2 ))
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
 #run_eval --framework lighteval --task gsm8k --num-fewshot 5
 append_lm_eval_summary
 set +x
diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh
index e5a3a6961..3db49afa2 100644
--- a/benchmarks/gptoss_fp4_h200_slurm.sh
+++ b/benchmarks/gptoss_fp4_h200_slurm.sh
@@ -75,7 +75,7 @@ run_benchmark_serving \
   --result-dir /workspace/
 
 # After throughput, run evaluation (defaults to GSM8K)
-#run_eval --framework lm-eval --port "$PORT"
-run_eval --framework lighteval --task gsm8k --num-fewshot 5
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+#run_eval --framework lighteval --task gsm8k --num-fewshot 5
 append_lm_eval_summary
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_h200_trt_slurm.sh b/benchmarks/gptoss_fp4_h200_trt_slurm.sh
index 620ddfb42..b6617ee9b 100644
--- a/benchmarks/gptoss_fp4_h200_trt_slurm.sh
+++ b/benchmarks/gptoss_fp4_h200_trt_slurm.sh
@@ -1,16 +1,12 @@
 #!/usr/bin/env bash
 
-# === Required Env Vars ===
-# HF_TOKEN
-# HF_HUB_CACHE
-# IMAGE
+# === Required Env Vars ===
 # MODEL
+# TP
+# CONC
 # ISL
 # OSL
-# MAX_MODEL_LEN
 # RANDOM_RANGE_RATIO
-# TP
-# CONC
 # RESULT_FILENAME
 # PORT_OFFSET
 # DP_ATTENTION
@@ -50,12 +46,12 @@ trtllm-serve $MODEL \
 --max_num_tokens 20000 \
 --backend pytorch \
 --extra_llm_api_options gptoss-config.yml \
---ep_size=1 \
+--ep_size=$EP_SIZE \
 --trust_remote_code \
 --gpus_per_node 8 \
 --host 0.0.0.0 \
 --port $PORT \
---tp_size=1 \
+--tp_size=$TP \
 --pp_size=1 \
 > $SERVER_LOG 2>&1 &
 
@@ -77,8 +73,4 @@ run_benchmark_serving \
   --num-prompts $(( $CONC * 10 )) \
   --max-concurrency "$CONC" \
   --result-filename "$RESULT_FILENAME" \
-  --result-dir /workspace/
-
-# After throughput, run evaluation (defaults to GSM8K)
-#run_lm_eval --port "$PORT"
-append_lm_eval_summary
\ No newline at end of file
+  --result-dir /workspace/
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh
index 0dade438d..95b4678de 100644
--- a/benchmarks/gptoss_fp4_mi300x_docker.sh
+++ b/benchmarks/gptoss_fp4_mi300x_docker.sh
@@ -62,7 +62,7 @@ run_benchmark_serving \
   --result-dir /workspace/
 
 # After throughput, run evaluation (defaults to GSM8K)
-#run_eval --framework lm-eval --port "$PORT"
-run_eval --framework lighteval --task gsm8k --num-fewshot 5
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+#run_eval --framework lighteval --task gsm8k --num-fewshot 5
 append_lm_eval_summary
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_mi300x_slurm.sh b/benchmarks/gptoss_fp4_mi300x_slurm.sh
index 8dbeefcf2..fe287eb50 100644
--- a/benchmarks/gptoss_fp4_mi300x_slurm.sh
+++ b/benchmarks/gptoss_fp4_mi300x_slurm.sh
@@ -71,6 +71,6 @@ run_benchmark_serving \
   --result-dir /workspace/
 
 # After throughput, run evaluation (defaults to GSM8K)
-#run_lm_eval --port "$PORT"
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
 append_lm_eval_summary
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_mi325x_docker.sh b/benchmarks/gptoss_fp4_mi325x_docker.sh
index db330a1b8..aa42c2888 100644
--- a/benchmarks/gptoss_fp4_mi325x_docker.sh
+++ b/benchmarks/gptoss_fp4_mi325x_docker.sh
@@ -58,6 +58,6 @@ run_benchmark_serving \
   --result-dir /workspace/
 
 # After throughput, run evaluation (defaults to GSM8K)
-#run_lm_eval --port "$PORT"
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
 append_lm_eval_summary
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh
index 40ed241f7..a49729bb5 100644
--- a/benchmarks/gptoss_fp4_mi325x_slurm.sh
+++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh
@@ -1,16 +1,13 @@
 #!/usr/bin/bash
 
-# === Required Env Vars ===
-# HF_TOKEN
-# HF_HUB_CACHE
-# IMAGE
+# === Required Env Vars ===
 # MODEL
+# TP
+# CONC
 # ISL
 # OSL
 # MAX_MODEL_LEN
 # RANDOM_RANGE_RATIO
-# TP
-# CONC
 # RESULT_FILENAME
 
 echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh
index 305210cf7..a04bbdba8 100644
--- a/benchmarks/gptoss_fp4_mi355x_docker.sh
+++ b/benchmarks/gptoss_fp4_mi355x_docker.sh
@@ -60,7 +60,7 @@ run_benchmark_serving \
   --result-dir /workspace/
 
 # After throughput, run evaluation (defaults to GSM8K)
-#run_eval --framework lm-eval --port "$PORT"
-run_eval --framework lighteval --task gsm8k --num-fewshot 5
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+#run_eval --framework lighteval --task gsm8k --num-fewshot 5
 append_lm_eval_summary
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_mi355x_slurm.sh b/benchmarks/gptoss_fp4_mi355x_slurm.sh
index a999303b1..b364c4758 100644
--- a/benchmarks/gptoss_fp4_mi355x_slurm.sh
+++ b/benchmarks/gptoss_fp4_mi355x_slurm.sh
@@ -61,6 +61,6 @@ run_benchmark_serving \
   --result-dir /workspace/
 
 # After throughput, run evaluation (defaults to GSM8K)
-#run_lm_eval --port "$PORT"
+run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
 append_lm_eval_summary
 set +x
\ No newline at end of file

From 688e2c52231d77795c9ef3e72580ec43f0d8440c Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Fri, 28 Nov 2025 13:46:21 +0800
Subject: [PATCH 146/214] all change, test deepseek

---
 .github/workflows/eval-gms8k.yml | 2 +-
 benchmarks/benchmark_lib.sh      | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml
index 3e71d6885..081f94120 100644
--- a/.github/workflows/eval-gms8k.yml
+++ b/.github/workflows/eval-gms8k.yml
@@ -55,7 +55,7 @@ jobs:
       framework: sglang
       precision: fp8
       exp-name: ${{ inputs.exp-name || 'dsr1_gsm8k_poc' }}
-      tp: '8'
+      tp: '8' 
       ep: '1'
       dp-attn: false
       port: ${{ inputs.port || '8888' }}
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 29698e52e..83e48a352 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -290,6 +290,7 @@ run_lm_eval() {
   local openai_server_base="http://0.0.0.0:${port}"
   local openai_chat_base="${openai_server_base}/v1/chat/completions"
   export OPENAI_API_KEY=${OPENAI_API_KEY:-EMPTY}
+  MODEL_NAME=${MODEL_NAME:-$MODEL}  # Prefer MODEL_NAME, else MODEL
 
   set -x
   python3 -m lm_eval --model local-chat-completions --apply_chat_template \

From 6b320ce8f1befe539c5e9e55d6c27eb5ca7f914 Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Fri, 28 Nov 2025 18:58:54 +0800
Subject: [PATCH 147/214] retest mi325x

---
 benchmarks/benchmark_lib.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 83e48a352..7d457530d 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -141,6 +141,7 @@ _patch_lm_eval_filters() {
   cat > "$patch_dir/sitecustomize.py" <<'PY'
 import re, sys, unicodedata
 from lm_eval.filters import extraction as ex
+from lm_eval.models.openai_completions import LocalChatCompletion as _LCC
 
 def _s(x):  # coerce to str
     return x if isinstance(x, str) else ""
@@ -265,7 +266,7 @@ run_lm_eval() {
   local task="${EVAL_TASK:-gsm8k}"
   local num_fewshot="${NUM_FEWSHOT:-5}"
   local results_dir="${EVAL_RESULT_DIR:-eval_out}"
-  local gen_max_tokens=4096
+  local gen_max_tokens=1024
   local temperature=0
   local top_p=1
   local concurrent_requests=32
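Patches 145 through 147 fold the old inline eval block into `benchmark_lib.sh`, which keeps patching lm-eval through a generated `sitecustomize.py` placed on `PYTHONPATH`. As a reading aid, here is a minimal, self-contained sketch of that loading mechanism; the file body and printed strings are placeholders, not the real shim:

```python
# Sketch of the sitecustomize trick used by _patch_lm_eval_filters: any module
# named `sitecustomize` that is importable at interpreter startup runs before
# the target CLI, so it can rebind library functions without forking the repo.
import os
import subprocess
import sys
import tempfile

patch_dir = tempfile.mkdtemp()
with open(os.path.join(patch_dir, "sitecustomize.py"), "w") as f:
    # Placeholder body; the real shim imports lm_eval and rebinds its filters.
    f.write('print("sitecustomize loaded: patches applied")\n')

env = dict(os.environ)
env["PYTHONPATH"] = patch_dir + os.pathsep + env.get("PYTHONPATH", "")

# Every child interpreter launched with this env runs the shim first.
subprocess.run([sys.executable, "-c", 'print("lm_eval would start here")'],
               env=env, check=True)
```

Expected output is the shim's line followed by the child's line; the `python3 -m lm_eval` call inside `run_lm_eval` picks the shim up the same way.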
From 9768deaf7fc1933270d81b013d58d4c3704f1642 Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Fri, 28 Nov 2025 19:02:45 +0800
Subject: [PATCH 148/214] test b200

---
 .github/workflows/eval-gms8k.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml
index 081f94120..0f0420fab 100644
--- a/.github/workflows/eval-gms8k.yml
+++ b/.github/workflows/eval-gms8k.yml
@@ -49,13 +49,13 @@ jobs:
     uses: ./.github/workflows/eval-tmpl.yml
     secrets: inherit
     with:
-      runner: mi325x-tw_0
-      image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' }}
-      model: ${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }}
+      runner: b200-nvd_2
+      image: ${{ inputs.image || 'lmsysorg/sglang:v0.5.5-cu129-amd64' }}
+      model: ${{ inputs.model || 'nvidia/DeepSeek-R1-0528-FP4-V2' }}
       framework: sglang
-      precision: fp8
+      precision: fp4
       exp-name: ${{ inputs.exp-name || 'dsr1_gsm8k_poc' }}
-      tp: '8' 
+      tp: '8'
       ep: '1'
       dp-attn: false
       port: ${{ inputs.port || '8888' }}

From 4c339b4637a0163cf6e2bf64867fcb4375cf6be9 Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Fri, 28 Nov 2025 19:38:02 +0800
Subject: [PATCH 149/214] clean b200

---
 .github/workflows/eval-tmpl.yml | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml
index 6e0d3bbb7..d2b43f569 100644
--- a/.github/workflows/eval-tmpl.yml
+++ b/.github/workflows/eval-tmpl.yml
@@ -79,6 +79,7 @@ jobs:
     steps:
       - name: Resource cleanup
         run: |
+          sudo rm -rf /home/nvadmin/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out/
          # Helper to avoid indefinite hangs on flaky tools (Docker/Slurm)
          safe_timeout() {
            if command -v timeout >/dev/null 2>&1; then
@@ -158,9 +159,4 @@ jobs:
 
       - name: Resource cleanup
         run: |
-          pkill -f litellm || true
-          pkill -f lighteval || true
-          sleep 2
-
-          sudo rm -rf .litellm_cache
           sudo rm -rf eval_out*
\ No newline at end of file

From efe94aa85a1f11c2c2234ed71b1d458b943604a8 Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Fri, 28 Nov 2025 19:56:52 +0800
Subject: [PATCH 150/214] test h200

---
 .github/workflows/eval-gms8k.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml
index 0f0420fab..92226ffad 100644
--- a/.github/workflows/eval-gms8k.yml
+++ b/.github/workflows/eval-gms8k.yml
@@ -49,11 +49,11 @@ jobs:
     uses: ./.github/workflows/eval-tmpl.yml
     secrets: inherit
     with:
-      runner: b200-nvd_2
+      runner: h200-cw_1
       image: ${{ inputs.image || 'lmsysorg/sglang:v0.5.5-cu129-amd64' }}
-      model: ${{ inputs.model || 'nvidia/DeepSeek-R1-0528-FP4-V2' }}
+      model: ${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }}
       framework: sglang
-      precision: fp4
+      precision: fp8
       exp-name: ${{ inputs.exp-name || 'dsr1_gsm8k_poc' }}
       tp: '8'
       ep: '1'

From 705fc106e982ac4ced02a8a6c984785e0b6cf257 Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Fri, 28 Nov 2025 20:42:30 +0800
Subject: [PATCH 151/214] H200 test

---
 .github/workflows/eval-gms8k.yml | 2 +-
 benchmarks/benchmark_lib.sh      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml
index 92226ffad..f7658eaa9 100644
--- a/.github/workflows/eval-gms8k.yml
+++ b/.github/workflows/eval-gms8k.yml
@@ -52,7 +52,7 @@ jobs:
       runner: h200-cw_1
       image: ${{ inputs.image || 'lmsysorg/sglang:v0.5.5-cu129-amd64' }}
       model: ${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }}
-      framework: sglang
+      framework: sglang 
       precision: fp8
       exp-name: ${{ inputs.exp-name || 'dsr1_gsm8k_poc' }}
       tp: '8'
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 7d457530d..529dbfa6e 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -266,7 +266,7 @@ run_lm_eval() {
   local task="${EVAL_TASK:-gsm8k}"
   local num_fewshot="${NUM_FEWSHOT:-5}"
   local results_dir="${EVAL_RESULT_DIR:-eval_out}"
-  local gen_max_tokens=1024
+  local gen_max_tokens=4096
   local temperature=0
   local top_p=1
   local concurrent_requests=32

From f79f243389d314dfd7698a5644cae58dbbc8a6a5 Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Fri, 28 Nov 2025 23:52:06 +0800
Subject: [PATCH 152/214] B200-nvd2 sleep

---
 .github/workflows/drain-b200-nvd2.yml | 36 +++++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100644 .github/workflows/drain-b200-nvd2.yml

diff --git a/.github/workflows/drain-b200-nvd2.yml b/.github/workflows/drain-b200-nvd2.yml
new file mode 100644
index 000000000..a7aeae576
--- /dev/null
+++ b/.github/workflows/drain-b200-nvd2.yml
@@ -0,0 +1,36 @@
+name: Drain b200-nvd_2
+
+on:
+  workflow_dispatch:
+    inputs:
+      minutes:
+        description: Minutes to hold (defaults to 72h if empty)
+        required: false
+        default: ""
+
+jobs:
+  hold:
+    # Pin specifically to the self-hosted runner label for b200-nvd_2
+    runs-on: [b200-nvd_2]
+    # Many orgs cap at 72h; adjust if your org allows more
+    timeout-minutes: ${{ fromJSON(github.event.inputs.minutes || '0') > 0 && fromJSON(github.event.inputs.minutes) || 4320 }}
+    steps:
+      - name: Start drain
+        shell: bash
+        run: |
+          set -euo pipefail
+          echo "Holding runner: $RUNNER_NAME"
+          echo "Runner OS/Arch: $RUNNER_OS / $RUNNER_ARCH"
+          echo "Started at: $(date -Iseconds)"
+          echo "Cancel this workflow run to release the runner."
+
+      - name: Hold indefinitely (until timeout or cancel)
+        shell: bash
+        run: |
+          set -euo pipefail
+          trap 'echo "Release signal received at $(date -Iseconds)"; exit 0' INT TERM
+          while true; do
+            echo "Still holding at $(date -Iseconds)"
+            sleep 1800
+          done
+

From d9a4fede00258ad508280362b69994ae704dc057 Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Fri, 28 Nov 2025 23:58:49 +0800
Subject: [PATCH 153/214] B200-nvd2 sleep

---
 .github/workflows/drain-b200-nvd2.yml | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/drain-b200-nvd2.yml b/.github/workflows/drain-b200-nvd2.yml
index a7aeae576..41bf0f027 100644
--- a/.github/workflows/drain-b200-nvd2.yml
+++ b/.github/workflows/drain-b200-nvd2.yml
@@ -11,9 +11,9 @@ on:
 jobs:
   hold:
     # Pin specifically to the self-hosted runner label for b200-nvd_2
-    runs-on: [b200-nvd_2]
-    # Many orgs cap at 72h; adjust if your org allows more
-    timeout-minutes: ${{ fromJSON(github.event.inputs.minutes || '0') > 0 && fromJSON(github.event.inputs.minutes) || 4320 }}
+    runs-on: [self-hosted, b200-nvd_2]
+    # Hold for 24h by default (override by canceling anytime)
+    timeout-minutes: 1440
     steps:
       - name: Start drain
         shell: bash
@@ -33,4 +33,3 @@ jobs:
             echo "Still holding at $(date -Iseconds)"
             sleep 1800
           done
-

From 8c6b944ed5c939b74428bf1439600060f67b7cdf Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Sat, 29 Nov 2025 00:00:04 +0800
Subject: [PATCH 154/214] B200-nvd2 sleep

---
 .github/workflows/drain-b200-nvd2.yml | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/drain-b200-nvd2.yml b/.github/workflows/drain-b200-nvd2.yml
index 41bf0f027..08646c3b4 100644
--- a/.github/workflows/drain-b200-nvd2.yml
+++ b/.github/workflows/drain-b200-nvd2.yml
@@ -1,12 +1,9 @@
 name: Drain b200-nvd_2
 
 on:
-  workflow_dispatch:
-    inputs:
-      minutes:
-        description: Minutes to hold (defaults to 72h if empty)
-        required: false
-        default: ""
+  push:
+    paths:
+      - '.github/workflows/drain-b200-nvd2.yml'
 
 jobs:
   hold:

From 28a026fd68b465f0e0b6e05e26310acf4cfc4f4d Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Sat, 29 Nov 2025 02:36:22 +0800
Subject: [PATCH 155/214] mi325x test

---
 .github/workflows/eval-gms8k.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml
index f7658eaa9..0734aca4c 100644
--- a/.github/workflows/eval-gms8k.yml
+++ b/.github/workflows/eval-gms8k.yml
@@ -49,10 +49,10 @@ jobs:
     uses: ./.github/workflows/eval-tmpl.yml
     secrets: inherit
     with:
-      runner: h200-cw_1
-      image: ${{ inputs.image || 'lmsysorg/sglang:v0.5.5-cu129-amd64' }}
+      runner: mi325x-tw_1
+      image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' }}
       model: ${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }}
-      framework: sglang 
+      framework: sglang
       precision: fp8
       exp-name: ${{ inputs.exp-name || 'dsr1_gsm8k_poc' }}
       tp: '8'

From c4bd3d2e6b086644777b90cd620f69a6e6f3a9e2 Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Sat, 29 Nov 2025 02:37:42 +0800
Subject: [PATCH 156/214] mi325x test, no text, no empty fix

---
 .github/workflows/eval-gms8k.yml |   2 +-
 benchmarks/benchmark_lib.sh      | 132 ++++++++----------------
 2 files changed, 41 insertions(+), 93 deletions(-)

diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml
index 0734aca4c..3e71d6885 100644
--- a/.github/workflows/eval-gms8k.yml
+++ b/.github/workflows/eval-gms8k.yml
@@ -49,7 +49,7 @@ jobs:
     uses: ./.github/workflows/eval-tmpl.yml
     secrets: inherit
     with:
-      runner: mi325x-tw_1
+      runner: mi325x-tw_0
       image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' }}
       model: ${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }}
       framework: sglang
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 529dbfa6e..5565756ca 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -133,107 +133,17 @@ _install_lm_eval_deps() {
     "git+https://github.com/EleutherAI/lm-evaluation-harness.git@main" || true
 }
 
+# Patch lm-eval filters to be robust to empty strings via sitecustomize
 # Patch lm-eval filters to be robust to empty strings via sitecustomize
 _patch_lm_eval_filters() {
   set +x
   local patch_dir
   patch_dir="$(mktemp -d)"
   cat > "$patch_dir/sitecustomize.py" <<'PY'
-import re, sys, unicodedata
+import re, sys, unicodedata, json
 from lm_eval.filters import extraction as ex
 from lm_eval.models.openai_completions import LocalChatCompletion as _LCC
 
-def _s(x):  # coerce to str
-    return x if isinstance(x, str) else ""
-
-# --- Patch RegexFilter.apply ---
-_orig_regex_apply = ex.RegexFilter.apply
-def _safe_regex_apply(self, resps, docs):
-    out = []
-    for inst in resps:
-        filtered = []
-        for resp in inst:
-            txt = _s(resp)
-            m = self.regex.findall(txt)
-            if m:
-                m = m[self.group_select]
-                if isinstance(m, tuple):
-                    m = [t for t in m if t]
-                    m = m[0] if m else self.fallback
-                m = m.strip()
-            else:
-                m = self.fallback
-            filtered.append(m)
-        out.append(filtered)
-    return out
-ex.RegexFilter.apply = _safe_regex_apply
-
-# --- Patch MultiChoiceRegexFilter.apply ---
-_orig_mc_apply = ex.MultiChoiceRegexFilter.apply
-def _safe_mc_apply(self, resps, docs):
-    def find_match(regex, resp, convert_dict={}):
-        txt = _s(resp)
-        match = regex.findall(txt)
-        if match:
-            match = match[self.group_select]
-            if isinstance(match, tuple):
-                match = [m for m in match if m]
-                if match:
-                    match = match[0]
-            if match:
-                match = match.strip()
-                if match in convert_dict:
-                    return convert_dict[match]
-            return match
-        return None
-
-    punct_tbl = dict.fromkeys(
-        i for i in range(sys.maxunicode)
-        if unicodedata.category(chr(i)).startswith("P")
-    )
-
-    def filter_ignores(st):
-        st = _s(st)
-        if self.regexes_to_ignore is not None:
-            for s in self.regexes_to_ignore:
-                st = re.sub(s, "", st)
-        if self.ignore_case:
-            st = st.lower()
-        if self.ignore_punctuation:
-            st = st.translate(punct_tbl)
-        return st
-
-    out = []
-    for r, doc in zip(resps, docs):
-        fallback_regexes, choice_to_alpha = [], {}
-        next_alpha = "A"
-        without_paren, without_paren_to_target = [], {}
-        for c in doc.get("choices", []):
-            m = filter_ignores(c.strip())
-            fallback_regexes.append(re.escape(m))
-            choice_to_alpha[m] = f"({next_alpha})"
-            without_paren.append(next_alpha)
-            without_paren_to_target[next_alpha] = f"({next_alpha})"
-            next_alpha = chr(ord(next_alpha) + 1)
-
-        fallback_regex = re.compile("|".join(fallback_regexes)) if fallback_regexes else None
-        without_paren_regex = re.compile(rf":[\s]*({'|'.join(without_paren)})") if without_paren else None
-
-        filtered = []
-        for resp in r:
-            m = find_match(self.regex, resp)
-            if not m and fallback_regex:
-                m = find_match(fallback_regex, filter_ignores(resp), choice_to_alpha)
-            if not m and without_paren_regex:
-                m = find_match(without_paren_regex, resp, without_paren_to_target)
-            if not m:
-                m = self.fallback
-            filtered.append(m)
-        out.append(filtered)
-    return out
-
-ex.MultiChoiceRegexFilter.apply = _safe_mc_apply
-
 def _le_parse_generations(outputs, **kwargs):
     res = []
     if not isinstance(outputs, list):
         outputs = [outputs]
@@ -256,6 +166,44 @@ def _le_parse_generations(outputs, **kwargs):
 
 # Keep staticmethod semantics
 _LCC.parse_generations = staticmethod(_le_parse_generations)
+
+# --- Patch TemplateAPI.apply_chat_template to avoid injecting "type": "text" ---
+try:
+    from lm_eval.models import api_models as _api_models
+    _TemplateAPI = _api_models.TemplateAPI
+    _JsonChatStr = _api_models.JsonChatStr
+except Exception:
+    _TemplateAPI = None
+    _JsonChatStr = None
+
+if _TemplateAPI is not None and _JsonChatStr is not None:
+    _orig_apply_chat_template = _TemplateAPI.apply_chat_template
+
+    def _patched_apply_chat_template(
+        self,
+        chat_history,
+        add_generation_prompt: bool = True,
+    ):
+        """Applies a chat template to a list of chat history between user and model."""
+        if self.tokenizer_backend == "huggingface" and self.tokenized_requests:
+            return self.tokenizer.apply_chat_template(
+                chat_history,
+                tokenize=False,
+                add_generation_prompt=add_generation_prompt,
+                continue_final_message=not add_generation_prompt,
+            )
+        elif self.tokenizer_backend == "remote" and self.tokenized_requests:
+            return chat_history
+        else:
+            # NOTE: we no longer inject `"type": "text"` when tokenizer is None / non-HF
+            return _JsonChatStr(
+                json.dumps(
+                    [{**item} for item in chat_history],
+                    ensure_ascii=False,
+                )
+            )
+
+    _TemplateAPI.apply_chat_template = _patched_apply_chat_template
 PY
 export PYTHONPATH="${patch_dir}:${PYTHONPATH:-}"
 }
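The `apply_chat_template` override above is the heart of patch 156's "no text" fix. A standalone sketch of the two payload shapes involved; the wrapped form is an assumption inferred from the patch's own NOTE comment rather than a copy of upstream lm-eval:

```python
import json

chat_history = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is 2 + 2?"},
]

# Wrapped shape (what the patch avoids): content becomes typed parts, which
# some OpenAI-compatible servers reject or mis-handle.
wrapped = json.dumps(
    [{"role": m["role"], "content": [{"type": "text", "text": m["content"]}]}
     for m in chat_history],
    ensure_ascii=False,
)

# Patched shape: messages forwarded as plain role/content pairs, exactly the
# `[{**item} for item in chat_history]` expression used in the shim.
plain = json.dumps([{**m} for m in chat_history], ensure_ascii=False)

print(wrapped)
print(plain)
```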
From af2c3855c2762ee9fa2f7f0f96c5333d8263499d Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Sat, 29 Nov 2025 17:34:28 +0800
Subject: [PATCH 157/214] h100, tmp eval_out

---
 .github/workflows/benchmark-tmpl.yml         |  6 ++-
 .github/workflows/eval-gms8k.yml             | 14 ++---
 .github/workflows/eval-tmpl.yml              |  6 +--
 .github/workflows/full-sweep-test.yml        |  6 +++
 benchmarks/benchmark_lib.sh                  | 15 +++---
 benchmarks/dsr1_fp4_b200_docker.sh           | 10 ++--
 benchmarks/dsr1_fp4_mi355x_docker.sh         | 10 ++--
 benchmarks/dsr1_fp4_mi355x_slurm.sh          | 10 ++--
 benchmarks/dsr1_fp8_b200_docker.sh           | 10 ++--
 benchmarks/dsr1_fp8_h200_slurm.sh            | 10 ++--
 benchmarks/dsr1_fp8_mi300x_docker.sh         | 10 ++--
 benchmarks/dsr1_fp8_mi300x_slurm.sh          | 10 ++--
 benchmarks/dsr1_fp8_mi325x_docker.sh         | 10 ++--
 benchmarks/dsr1_fp8_mi325x_slurm.sh          | 10 ++--
 benchmarks/dsr1_fp8_mi355x_docker.sh         | 10 ++--
 benchmarks/dsr1_fp8_mi355x_slurm.sh          | 10 ++--
 benchmarks/gptoss_fp4_b200_docker.sh         |  9 ++--
 benchmarks/gptoss_fp4_h100_docker.sh         |  9 ++--
 benchmarks/gptoss_fp4_h100_slurm.sh          | 11 ++--
 benchmarks/gptoss_fp4_h200_slurm.sh          |  9 ++--
 benchmarks/gptoss_fp4_mi300x_docker.sh       |  9 ++--
 benchmarks/gptoss_fp4_mi300x_slurm.sh        |  8 +--
 benchmarks/gptoss_fp4_mi325x_docker.sh       |  8 +--
 benchmarks/gptoss_fp4_mi325x_slurm.sh        |  9 ++--
 benchmarks/gptoss_fp4_mi355x_docker.sh       |  9 ++--
 benchmarks/gptoss_fp4_mi355x_slurm.sh        |  8 +--
 utils/matrix-logic/generate_sweep_configs.py | 54 ++++++++++++++++++++
 27 files changed, 188 insertions(+), 112 deletions(-)

diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
index 4496ac001..436b156a8 100644
--- a/.github/workflows/benchmark-tmpl.yml
+++ b/.github/workflows/benchmark-tmpl.yml
@@ -45,7 +45,10 @@ on:
         required: false
         type: string
         default: '0.8'
-
+      run-eval:
+        type: boolean
+        required: false
+        default: false
 env:
   HF_TOKEN: ${{ secrets.HF_TOKEN }}
   HF_HUB_CACHE: '/mnt/hf_hub_cache/'
@@ -62,6 +65,7 @@ env:
   EP_SIZE: ${{ inputs.ep }}
   DP_ATTENTION: ${{ inputs.dp-attn }}
   CONC: ${{ inputs.conc }}
+  RUN_EVAL: ${{ inputs.run-eval }}
 
 permissions:
   contents: read
diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml
index 3e71d6885..5a7e7e823 100644
--- a/.github/workflows/eval-gms8k.yml
+++ b/.github/workflows/eval-gms8k.yml
@@ -49,13 +49,13 @@ jobs:
     uses: ./.github/workflows/eval-tmpl.yml
     secrets: inherit
     with:
-      runner: mi325x-tw_0
-      image: ${{ inputs.image || 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' }}
-      model: ${{ inputs.model || 'deepseek-ai/DeepSeek-R1-0528' }}
-      framework: sglang
-      precision: fp8
-      exp-name: ${{ inputs.exp-name || 'dsr1_gsm8k_poc' }}
-      tp: '8'
+      runner: h100-cw_0
+      image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }}
+      model: ${{ inputs.model || 'openai/gpt-oss-120b' }}
+      framework: vllm
+      precision: fp4
+      exp-name: ${{ inputs.exp-name || 'gptoss_gsm8k_poc' }}
+      tp: '4'
       ep: '1'
       dp-attn: false
       port: ${{ inputs.port || '8888' }}
diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml
index d2b43f569..63c4164f1 100644
--- a/.github/workflows/eval-tmpl.yml
+++ b/.github/workflows/eval-tmpl.yml
@@ -155,8 +155,4 @@ jobs:
           path: |
             ${{ env.EVAL_RESULT_DIR }}/
             ${{ env.EVAL_RESULT_DIR }}/*
-            ${{ env.EVAL_RESULT_DIR }}/**
-
-      - name: Resource cleanup
-        run: |
-          sudo rm -rf eval_out*
\ No newline at end of file
+            ${{ env.EVAL_RESULT_DIR }}/**
\ No newline at end of file
diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml
index cf889e208..01bae1f5d 100644
--- a/.github/workflows/full-sweep-test.yml
+++ b/.github/workflows/full-sweep-test.yml
@@ -130,6 +130,7 @@ jobs:
       ep: ${{ matrix.config.ep }}
       dp-attn: ${{ matrix.config.dp-attn }}
       conc: ${{ matrix.config.conc }}
+      run-eval: ${{ matrix.config.run-eval }}
 
   collect-dsr1-1k1k-results:
     needs: benchmark-dsr1-1k1k
@@ -164,6 +165,7 @@ jobs:
       ep: ${{ matrix.config.ep }}
       dp-attn: ${{ matrix.config.dp-attn }}
      conc: ${{ matrix.config.conc }}
+      run-eval: ${{ matrix.config.run-eval }}
 
   collect-gptoss-1k1k-results:
     needs: benchmark-gptoss-1k1k
@@ -198,6 +200,7 @@ jobs:
       ep: ${{ matrix.config.ep }}
       dp-attn: ${{ matrix.config.dp-attn }}
       conc: ${{ matrix.config.conc }}
+      run-eval: ${{ matrix.config.run-eval }}
 
   collect-dsr1-8k1k-results:
     needs: benchmark-dsr1-8k1k
@@ -232,6 +235,7 @@ jobs:
       ep: ${{ matrix.config.ep }}
       dp-attn: ${{ matrix.config.dp-attn }}
       conc: ${{ matrix.config.conc }}
+      run-eval: ${{ matrix.config.run-eval }}
 
   collect-gptoss-8k1k-results:
     needs: benchmark-gptoss-8k1k
@@ -266,6 +270,7 @@ jobs:
       ep: ${{ matrix.config.ep }}
       dp-attn: ${{ matrix.config.dp-attn }}
       conc: ${{ matrix.config.conc }}
+      run-eval: ${{ matrix.config.run-eval }}
 
 # This is a workaround until we can integrate GB200 into master configs.
   benchmark-gb200-1k1k:
@@ -394,6 +399,7 @@ jobs:
       ep: ${{ matrix.config.ep }}
       dp-attn: ${{ matrix.config.dp-attn }}
       conc: ${{ matrix.config.conc }}
+      run-eval: ${{ matrix.config.run-eval }}
 
   collect-gptoss-1k8k-results:
     needs: benchmark-gptoss-1k8k
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 5565756ca..5f2b51ec7 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -213,7 +213,7 @@ run_lm_eval() {
   local port="${PORT:-8888}"
   local task="${EVAL_TASK:-gsm8k}"
   local num_fewshot="${NUM_FEWSHOT:-5}"
-  local results_dir="${EVAL_RESULT_DIR:-eval_out}"
+  local results_dir="${EVAL_RESULT_DIR:-$(mktemp -d /tmp/eval_out-XXXXXX)}"
   local gen_max_tokens=4096
   local temperature=0
   local top_p=1
@@ -241,11 +241,14 @@ run_lm_eval() {
   export OPENAI_API_KEY=${OPENAI_API_KEY:-EMPTY}
   MODEL_NAME=${MODEL_NAME:-$MODEL}  # Prefer MODEL_NAME, else MODEL
 
+  # Export for append_lm_eval_summary to pick up
+  export EVAL_RESULT_DIR="$results_dir"
+
   set -x
   python3 -m lm_eval --model local-chat-completions --apply_chat_template \
     --tasks "utils/evals/${task}.yaml" \
     --num_fewshot "${num_fewshot}" \
-    --output_path "/workspace/${results_dir}" \
+    --output_path "${results_dir}" \
     --model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=2,num_concurrent=${concurrent_requests},tokenized_requests=False" \
     --gen_kwargs "max_tokens=${gen_max_tokens},temperature=${temperature},top_p=${top_p}"
   set +x
@@ -253,10 +256,9 @@ run_lm_eval() {
 
 append_lm_eval_summary() {
   set +x
-  local results_dir="${EVAL_RESULT_DIR:-eval_out}"
+  local results_dir="${EVAL_RESULT_DIR}"
   local task="${EVAL_TASK:-gsm8k}"
-  # Always render a local summary so the runner can pick it up
-  local out_dir="/workspace/${results_dir}"
+  local out_dir="${results_dir}"
   local summary_md="${out_dir}/SUMMARY.md"
   mkdir -p "$out_dir" || true
@@ -278,8 +280,9 @@ append_lm_eval_summary() {
       cat "$summary_md" >> "$GITHUB_STEP_SUMMARY" || true
     fi
   fi
-}
+
+  echo "Results saved to: ${summary_md}"
+}
 
 # ------------------------------
 # Lighteval + LiteLLM patching
diff --git a/benchmarks/dsr1_fp4_b200_docker.sh b/benchmarks/dsr1_fp4_b200_docker.sh
index 08469715e..31319c44e 100644
--- a/benchmarks/dsr1_fp4_b200_docker.sh
+++ b/benchmarks/dsr1_fp4_b200_docker.sh
@@ -48,9 +48,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-#MODEL_NAME="openai/$MODEL"
-#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC
-append_lm_eval_summary
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp4_mi355x_docker.sh b/benchmarks/dsr1_fp4_mi355x_docker.sh
index 2c5bd1e42..bad7c2854 100644
--- a/benchmarks/dsr1_fp4_mi355x_docker.sh
+++ b/benchmarks/dsr1_fp4_mi355x_docker.sh
@@ -51,9 +51,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-#MODEL_NAME="openai/$MODEL"
-#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC
-append_lm_eval_summary
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp4_mi355x_slurm.sh b/benchmarks/dsr1_fp4_mi355x_slurm.sh
index a208e8d26..31d6a94a9 100644
--- a/benchmarks/dsr1_fp4_mi355x_slurm.sh
+++ b/benchmarks/dsr1_fp4_mi355x_slurm.sh
@@ -54,9 +54,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-#MODEL_NAME="openai/$MODEL"
-#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC
-append_lm_eval_summary
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp8_b200_docker.sh b/benchmarks/dsr1_fp8_b200_docker.sh
index c98e07d08..73696ff4a 100644
--- a/benchmarks/dsr1_fp8_b200_docker.sh
+++ b/benchmarks/dsr1_fp8_b200_docker.sh
@@ -59,9 +59,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-#MODEL_NAME="openai/$MODEL"
-#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC
-append_lm_eval_summary
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp8_h200_slurm.sh b/benchmarks/dsr1_fp8_h200_slurm.sh
index 9ffd81f8d..088e71c0a 100644
--- a/benchmarks/dsr1_fp8_h200_slurm.sh
+++ b/benchmarks/dsr1_fp8_h200_slurm.sh
@@ -64,9 +64,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-#MODEL_NAME="openai/$MODEL"
-#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC
-append_lm_eval_summary
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp8_mi300x_docker.sh b/benchmarks/dsr1_fp8_mi300x_docker.sh
index 9e7d58295..8027ae1eb 100644
--- a/benchmarks/dsr1_fp8_mi300x_docker.sh
+++ b/benchmarks/dsr1_fp8_mi300x_docker.sh
@@ -57,9 +57,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-#MODEL_NAME="openai/$MODEL"
-#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC
-append_lm_eval_summary
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp8_mi300x_slurm.sh b/benchmarks/dsr1_fp8_mi300x_slurm.sh
index ff7621425..7e222726c 100644
--- a/benchmarks/dsr1_fp8_mi300x_slurm.sh
+++ b/benchmarks/dsr1_fp8_mi300x_slurm.sh
@@ -67,9 +67,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-#MODEL_NAME="openai/$MODEL"
-#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC
-append_lm_eval_summary
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp8_mi325x_docker.sh b/benchmarks/dsr1_fp8_mi325x_docker.sh
index 4231f2df7..940d4b076 100644
--- a/benchmarks/dsr1_fp8_mi325x_docker.sh
+++ b/benchmarks/dsr1_fp8_mi325x_docker.sh
@@ -48,9 +48,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-#MODEL_NAME="openai/$MODEL"
-#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC
-append_lm_eval_summary
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp8_mi325x_slurm.sh b/benchmarks/dsr1_fp8_mi325x_slurm.sh
index aef77d4ac..8adf3d745 100644
--- a/benchmarks/dsr1_fp8_mi325x_slurm.sh
+++ b/benchmarks/dsr1_fp8_mi325x_slurm.sh
@@ -43,9 +43,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-#MODEL_NAME="openai/$MODEL"
-#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC
-append_lm_eval_summary
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp8_mi355x_docker.sh b/benchmarks/dsr1_fp8_mi355x_docker.sh
index f692a2173..5aa0afd3e 100644
--- a/benchmarks/dsr1_fp8_mi355x_docker.sh
+++ b/benchmarks/dsr1_fp8_mi355x_docker.sh
@@ -58,9 +58,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-#MODEL_NAME="openai/$MODEL"
-#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC
-append_lm_eval_summary
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp8_mi355x_slurm.sh b/benchmarks/dsr1_fp8_mi355x_slurm.sh
index 7916371bf..ec734d1c6 100644
--- a/benchmarks/dsr1_fp8_mi355x_slurm.sh
+++ b/benchmarks/dsr1_fp8_mi355x_slurm.sh
@@ -52,9 +52,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-#MODEL_NAME="openai/$MODEL"
-#run_eval --framework lighteval --task gsm8k_long --num-fewshot 5 --concurrent-requests $CONC
-append_lm_eval_summary
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh
index 9d095a8ef..64613e11b 100644
--- a/benchmarks/gptoss_fp4_b200_docker.sh
+++ b/benchmarks/gptoss_fp4_b200_docker.sh
@@ -79,8 +79,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-#run_eval --framework lm-eval --port "$PORT"
-append_lm_eval_summary
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh
index b5de6a296..dd552d8c3 100644
--- a/benchmarks/gptoss_fp4_h100_docker.sh
+++ b/benchmarks/gptoss_fp4_h100_docker.sh
@@ -60,8 +60,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-#run_eval --framework lighteval --task gsm8k --num-fewshot 5
-append_lm_eval_summary
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh
index 6f9330f81..5922220ac 100644
--- a/benchmarks/gptoss_fp4_h100_slurm.sh
+++ b/benchmarks/gptoss_fp4_h100_slurm.sh
@@ -61,8 +61,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-#run_eval --framework lighteval --task gsm8k --num-fewshot 5
-append_lm_eval_summary
-set +x
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
+set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh
index 3db49afa2..7ca2ab001 100644
--- a/benchmarks/gptoss_fp4_h200_slurm.sh
+++ b/benchmarks/gptoss_fp4_h200_slurm.sh
@@ -74,8 +74,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-#run_eval --framework lighteval --task gsm8k --num-fewshot 5
-append_lm_eval_summary
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh
index 95b4678de..90b8eb42b 100644
--- a/benchmarks/gptoss_fp4_mi300x_docker.sh
+++ b/benchmarks/gptoss_fp4_mi300x_docker.sh
@@ -61,8 +61,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-#run_eval --framework lighteval --task gsm8k --num-fewshot 5
-append_lm_eval_summary
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_mi300x_slurm.sh b/benchmarks/gptoss_fp4_mi300x_slurm.sh
index fe287eb50..d97355818 100644
--- a/benchmarks/gptoss_fp4_mi300x_slurm.sh
+++ b/benchmarks/gptoss_fp4_mi300x_slurm.sh
@@ -70,7 +70,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-append_lm_eval_summary
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_mi325x_docker.sh b/benchmarks/gptoss_fp4_mi325x_docker.sh
index aa42c2888..a5cd09637 100644
--- a/benchmarks/gptoss_fp4_mi325x_docker.sh
+++ b/benchmarks/gptoss_fp4_mi325x_docker.sh
@@ -57,7 +57,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-append_lm_eval_summary
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh
index a49729bb5..80b58b45b 100644
--- a/benchmarks/gptoss_fp4_mi325x_slurm.sh
+++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh
@@ -66,8 +66,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-#run_eval --framework lighteval --task gsm8k --num-fewshot 5
-append_lm_eval_summary
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh
index a04bbdba8..e75b06d15 100644
--- a/benchmarks/gptoss_fp4_mi355x_docker.sh
+++ b/benchmarks/gptoss_fp4_mi355x_docker.sh
@@ -59,8 +59,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-#run_eval --framework lighteval --task gsm8k --num-fewshot 5
-append_lm_eval_summary
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_mi355x_slurm.sh b/benchmarks/gptoss_fp4_mi355x_slurm.sh
index b364c4758..845398ca0 100644
--- a/benchmarks/gptoss_fp4_mi355x_slurm.sh
+++ b/benchmarks/gptoss_fp4_mi355x_slurm.sh
@@ -60,7 +60,9 @@ run_benchmark_serving \
   --result-filename "$RESULT_FILENAME" \
   --result-dir /workspace/
 
-# After throughput, run evaluation (defaults to GSM8K)
-run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
-append_lm_eval_summary
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+  append_lm_eval_summary
+fi
 set +x
\ No newline at end of file
diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py
index f0d9a4390..1110e3c30 100644
--- a/utils/matrix-logic/generate_sweep_configs.py
+++ b/utils/matrix-logic/generate_sweep_configs.py
@@ -31,6 +31,9 @@
 FIELD_MAX_MODEL_LEN = 'max-model-len'
 FIELD_EXP_NAME = 'exp-name'
 
+# Eval
+FIELD_RUN_EVAL = 'run-eval'
+
 seq_len_stoi = {
     "1k1k": (1024, 1024),
     "1k8k": (1024, 8192),
@@ -67,6 +70,7 @@ class MatrixEntry(BaseModel):
     conc: int
     max_model_len: int = Field(alias='max-model-len')
     exp_name: str = Field(alias='exp-name')
+    run_eval: bool = Field(alias='run-eval', default=False)
 
 
 def validate_matrix_output(matrix_values: List[dict]) -> List[dict]:
@@ -82,6 +86,53 @@ def validate_matrix_output(matrix_values: List[dict]) -> List[dict]:
             raise ValueError(f"Matrix entry at index {i} failed validation:\n{e}")
     return matrix_values
 
+
+def mark_eval_entries(matrix_values: List[dict]) -> List[dict]:
+    """Mark entries that should run evaluation.
+
+    For each unique (model, runner, isl, osl) combination:
+    - Mark highest TP with highest conc
+    - Mark lowest TP with highest conc
+    """
+    from collections import defaultdict
+
+    # Group entries by (model, runner, isl, osl)
+    groups = defaultdict(list)
+    for i, entry in enumerate(matrix_values):
+        key = (entry[FIELD_MODEL], entry[FIELD_RUNNER], entry[FIELD_ISL], entry[FIELD_OSL])
+        groups[key].append((i, entry))
+
+    # For each group, find highest TP/highest conc and lowest TP/highest conc
+    eval_indices = set()
+    for key, entries in groups.items():
+        if not entries:
+            continue
+
+        # Find min and max TP values
+        min_tp = min(e[FIELD_TP] for _, e in entries)
+        max_tp = max(e[FIELD_TP] for _, e in entries)
+
+        # Find highest conc for highest TP
+        highest_tp_entries = [(i, e) for i, e in entries if e[FIELD_TP] == max_tp]
+        if highest_tp_entries:
+            max_conc_highest_tp = max(e[FIELD_CONC] for _, e in highest_tp_entries)
+            for i, e in highest_tp_entries:
+                if e[FIELD_CONC] == max_conc_highest_tp:
+                    eval_indices.add(i)
+
+        # Find highest conc for lowest TP (only if different from max_tp)
+        if min_tp != max_tp:
+            lowest_tp_entries = [(i, e) for i, e in entries if e[FIELD_TP] == min_tp]
+            if lowest_tp_entries:
+                max_conc_lowest_tp = max(e[FIELD_CONC] for _, e in lowest_tp_entries)
+                for i, e in lowest_tp_entries:
+                    if e[FIELD_CONC] == max_conc_lowest_tp:
+                        eval_indices.add(i)
+
+    # Mark the selected entries
+    for i, entry in enumerate(matrix_values):
+        entry[FIELD_RUN_EVAL] = i in eval_indices
+
+    return matrix_values
+
 
 def validate_master_configs_structure(all_config_data):
     """Validate the structure of all master config entries.
@@ -957,6 +1008,9 @@ def main():
     else:
         parser.error(f"Unknown command: {args.command}")
 
+    # Choose eval
+    matrix_values = mark_eval_entries(matrix_values)
+
     # Validate output before printing
     validate_matrix_output(matrix_values)
 

From 14068bc89655cd893b46a49459001273f3f9c813 Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Sat, 29 Nov 2025 17:46:33 +0800
Subject: [PATCH 158/214] h100, tmp eval_out, sweep integration

---
 .github/workflows/eval-tmpl.yml | 1 +
 utils/evals/gsm8k.yaml          | 1 +
 2 files changed, 2 insertions(+)

diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml
index 63c4164f1..04a2eebed 100644
--- a/.github/workflows/eval-tmpl.yml
+++ b/.github/workflows/eval-tmpl.yml
@@ -70,6 +70,7 @@ env:
   OSL: 1024
   RANDOM_RANGE_RATIO: '0.8'
   RESULT_FILENAME: results
+  RUN_EVAL: true
 
 jobs:
   eval:
diff --git a/utils/evals/gsm8k.yaml b/utils/evals/gsm8k.yaml
index 3d9e5ce3b..ab3113dc2 100644
--- a/utils/evals/gsm8k.yaml
+++ b/utils/evals/gsm8k.yaml
@@ -33,6 +33,7 @@ filter_list:
   - name: "strict-match"
     filter:
       - function: "regex"
+        group_select: -1
         regex_pattern: "#### (\\-?[0-9\\.\\,]+)"
       - function: "take_first"
   - name: "flexible-extract"
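Patch 157's `mark_eval_entries` decides which sweep entries carry `run-eval: true`. To see what the rule picks, here is a toy re-run of the same logic as a standalone script; it uses plain dict keys instead of the repo's `FIELD_*` constants, so the names are illustrative only:

```python
# Standalone re-run of the mark_eval_entries selection rule on a toy matrix:
# per (model, runner, isl, osl) group, mark the highest-conc entry at the
# lowest TP and at the highest TP.
from collections import defaultdict

entries = [
    {"model": "gptoss", "runner": "h100", "isl": 1024, "osl": 1024, "tp": 1, "conc": 16},
    {"model": "gptoss", "runner": "h100", "isl": 1024, "osl": 1024, "tp": 1, "conc": 64},
    {"model": "gptoss", "runner": "h100", "isl": 1024, "osl": 1024, "tp": 8, "conc": 16},
    {"model": "gptoss", "runner": "h100", "isl": 1024, "osl": 1024, "tp": 8, "conc": 64},
]

groups = defaultdict(list)
for i, e in enumerate(entries):
    groups[(e["model"], e["runner"], e["isl"], e["osl"])].append(i)

marked = set()
for idxs in groups.values():
    # The set collapses min and max TP into one pick when they are equal.
    for pick_tp in {min(entries[i]["tp"] for i in idxs),
                    max(entries[i]["tp"] for i in idxs)}:
        tp_idxs = [i for i in idxs if entries[i]["tp"] == pick_tp]
        best_conc = max(entries[i]["conc"] for i in tp_idxs)
        marked.update(i for i in tp_idxs if entries[i]["conc"] == best_conc)

for i, e in enumerate(entries):
    print(e["tp"], e["conc"], i in marked)
# Only (tp=1, conc=64) and (tp=8, conc=64) come out marked, so each sweep runs
# at most two GSM8K evals per model/runner/sequence-length group.
```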
name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.framework }} ${{ inputs.precision }} tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} conc=${{ inputs.conc }}' + name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.framework }} ${{ inputs.precision }} tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} conc=${{ inputs.conc }} eval=${{ inputs.run-eval }}' steps: - name: Resource cleanup run: | diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml index 04a2eebed..e4e65a581 100644 --- a/.github/workflows/eval-tmpl.yml +++ b/.github/workflows/eval-tmpl.yml @@ -63,7 +63,8 @@ env: EVAL_TASK: ${{ inputs['eval-task'] }} NUM_FEWSHOT: ${{ inputs['num-fewshot'] }} LIMIT: ${{ inputs.limit }} - EVAL_RESULT_DIR: eval_out + # Keep eval outputs only under /tmp + EVAL_RESULT_DIR: /tmp/eval_out CONC: '32' MAX_MODEL_LEN: '4096' ISL: 1024 @@ -148,12 +149,4 @@ jobs: run: | bash ./runners/launch_${RUNNER_NAME%%_*}.sh - - name: Upload eval artifacts - if: always() - uses: actions/upload-artifact@v5 - with: - name: eval_${{ env.EXP_NAME }}_${{ runner.name }} - path: | - ${{ env.EVAL_RESULT_DIR }}/ - ${{ env.EVAL_RESULT_DIR }}/* - ${{ env.EVAL_RESULT_DIR }}/** \ No newline at end of file + # Intentionally no eval artifact uploads: eval outputs remain in /tmp only. diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index 01bae1f5d..ed6132450 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -175,6 +175,7 @@ jobs: with: exp-name: "gptoss_1k1k" + # DSR1 8K1K Benchmarks benchmark-dsr1-8k1k: needs: get-configs @@ -245,6 +246,7 @@ jobs: with: exp-name: "gptoss_8k1k" + # DSR1 1K8K Benchmarks benchmark-dsr1-1k8k: needs: get-configs @@ -374,6 +376,7 @@ jobs: with: exp-name: "dsr1_1k8k" + # GPTOSS 1K8K Benchmarks benchmark-gptoss-1k8k: needs: get-configs @@ -409,6 +412,7 @@ jobs: with: exp-name: "gptoss_1k8k" + calc-success-rate: needs: [ diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 5f2b51ec7..05addc161 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -262,7 +262,7 @@ append_lm_eval_summary() { local summary_md="${out_dir}/SUMMARY.md" mkdir -p "$out_dir" || true - python3 utils/lm_eval_to_md.py \ + PYTHONNOUSERSITE=1 PYTHONPATH="" python3 -S utils/lm_eval_to_md.py \ --results-dir "$out_dir" \ --task "${task}" \ --framework "${FRAMEWORK}" \ @@ -281,6 +281,8 @@ append_lm_eval_summary() { fi fi + # Note: Per policy, eval outputs stay under /tmp only; do not copy to workspace. 
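The -S / PYTHONNOUSERSITE=1 / empty-PYTHONPATH trio above is what keeps stray user-site packages (the likely source of the "funny triton error" this patch removes) out of the summarizer process. A rough Python equivalent of the same isolation, with run_isolated as a hypothetical helper name, not shipped code:

    import os
    import subprocess

    def run_isolated(script, *args):
        # Mirror `PYTHONNOUSERSITE=1 PYTHONPATH="" python3 -S ...`:
        # -S skips site initialization so site-packages never lands on
        # sys.path, PYTHONNOUSERSITE=1 ignores ~/.local site-packages,
        # and an empty PYTHONPATH drops injected directories.
        env = dict(os.environ, PYTHONNOUSERSITE="1", PYTHONPATH="")
        return subprocess.run(["python3", "-S", script, *args],
                              env=env, capture_output=True, text=True)

A script launched this way can only rely on the standard library, which presumably suits a JSON-to-Markdown summarizer with no third-party imports.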
+ echo "Results saved to: ${summary_md}" } From 1a3262f574d3908ee89889916c185a9edbe96e10 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 29 Nov 2025 23:14:07 +0800 Subject: [PATCH 160/214] touch up sweep summary --- benchmarks/benchmark_lib.sh | 26 +++++++++++++++++++++++++- runners/launch_b200-nb.sh | 10 ---------- runners/launch_b200-nv.sh | 10 ---------- runners/launch_h100-cw.sh | 10 ---------- runners/launch_h200-cw.sh | 10 ---------- runners/launch_h200-nb.sh | 10 ---------- runners/launch_h200-nv.sh | 10 ---------- runners/launch_mi325x-amd.sh | 10 ---------- runners/launch_mi325x-tw.sh | 10 ---------- 9 files changed, 25 insertions(+), 81 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 05addc161..a2059db26 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -262,6 +262,22 @@ append_lm_eval_summary() { local summary_md="${out_dir}/SUMMARY.md" mkdir -p "$out_dir" || true + # Write minimal meta for collectors that expect it + local meta_json="${out_dir}/meta_env.json" + local model_name="${MODEL_NAME:-$MODEL}" + local dp_json="false" + if [ "${DP_ATTENTION}" = "true" ]; then dp_json="true"; fi + cat > "${meta_json}" <> "${GITHUB_STEP_SUMMARY}" || true - fi - fi -fi diff --git a/runners/launch_b200-nv.sh b/runners/launch_b200-nv.sh index 8a1afff8e..243e624f9 100644 --- a/runners/launch_b200-nv.sh +++ b/runners/launch_b200-nv.sh @@ -23,13 +23,3 @@ srun --jobid=$JOB_ID \ bash benchmarks/${MODEL_CODE}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_slurm.sh scancel $JOB_ID - -# Append eval summary within this same step when available -if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then - GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" - if [ -d "${GH_SUM_DIR}" ]; then - if [ -f "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" ]; then - cat "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" >> "${GITHUB_STEP_SUMMARY}" || true - fi - fi -fi diff --git a/runners/launch_h100-cw.sh b/runners/launch_h100-cw.sh index 864dc9c95..0179bdd57 100644 --- a/runners/launch_h100-cw.sh +++ b/runners/launch_h100-cw.sh @@ -18,13 +18,3 @@ srun --jobid=$JOB_ID \ bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_h100_slurm.sh scancel $JOB_ID - -# Append eval summary within this same step when available -if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then - GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" - if [ -d "${GH_SUM_DIR}" ]; then - if [ -f "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" ]; then - cat "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" >> "${GITHUB_STEP_SUMMARY}" || true - fi - fi -fi diff --git a/runners/launch_h200-cw.sh b/runners/launch_h200-cw.sh index 431e027f2..dd4937606 100644 --- a/runners/launch_h200-cw.sh +++ b/runners/launch_h200-cw.sh @@ -30,13 +30,3 @@ srun --jobid=$JOB_ID \ bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}_slurm.sh scancel $JOB_ID - -# Append eval summary within this same step when available -if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then - GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" - if [ -d "${GH_SUM_DIR}" ]; then - if [ -f "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" ]; then - cat "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" >> "${GITHUB_STEP_SUMMARY}" || true - fi - fi -fi diff --git a/runners/launch_h200-nb.sh b/runners/launch_h200-nb.sh index 19d6e82ba..c76b366d2 100644 --- a/runners/launch_h200-nb.sh +++ b/runners/launch_h200-nb.sh @@ -30,13 +30,3 @@ srun --jobid=$JOB_ID \ bash 
benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}_slurm.sh scancel $JOB_ID - -# Append eval summary within this same step when available -if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then - GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" - if [ -d "${GH_SUM_DIR}" ]; then - if [ -f "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" ]; then - cat "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" >> "${GITHUB_STEP_SUMMARY}" || true - fi - fi -fi diff --git a/runners/launch_h200-nv.sh b/runners/launch_h200-nv.sh index ca2ea6079..5319f8959 100644 --- a/runners/launch_h200-nv.sh +++ b/runners/launch_h200-nv.sh @@ -23,13 +23,3 @@ srun --jobid=$JOB_ID \ bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}_slurm.sh scancel $JOB_ID - -# Append eval summary within this same step when available -if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then - GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" - if [ -d "${GH_SUM_DIR}" ]; then - if [ -f "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" ]; then - cat "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" >> "${GITHUB_STEP_SUMMARY}" || true - fi - fi -fi diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh index 68affc9a1..eb5f8e00c 100644 --- a/runners/launch_mi325x-amd.sh +++ b/runners/launch_mi325x-amd.sh @@ -23,13 +23,3 @@ srun --jobid=$JOB_ID \ bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_mi325x_slurm.sh scancel $JOB_ID - -# Append eval summary within this same step when available -if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then - GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" - if [ -d "${GH_SUM_DIR}" ]; then - if [ -f "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" ]; then - cat "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" >> "${GITHUB_STEP_SUMMARY}" || true - fi - fi -fi diff --git a/runners/launch_mi325x-tw.sh b/runners/launch_mi325x-tw.sh index aa87a424d..ed6ff288e 100644 --- a/runners/launch_mi325x-tw.sh +++ b/runners/launch_mi325x-tw.sh @@ -23,13 +23,3 @@ srun --jobid=$JOB_ID \ bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_mi325x_slurm.sh scancel $JOB_ID - -# Fallback: append summary after job completes if container couldn't write directly -if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then - GH_SUM_DIR="$(dirname "${GITHUB_STEP_SUMMARY}")" - if [ -d "${GH_SUM_DIR}" ]; then - if [ -f "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" ]; then - cat "${GITHUB_WORKSPACE}/${EVAL_RESULT_DIR:-eval_out}/SUMMARY.md" >> "${GITHUB_STEP_SUMMARY}" || true - fi - fi -fi From 733d7ca50936e348b92cfc975fe099c184ea6bd7 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 30 Nov 2025 00:51:00 +0800 Subject: [PATCH 161/214] touch up run name --- .github/workflows/benchmark-tmpl.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 23ea7d518..424d67b26 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -74,7 +74,7 @@ jobs: benchmark: runs-on: ${{ inputs.runner }} timeout-minutes: 180 - name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.framework }} ${{ inputs.precision }} tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} conc=${{ inputs.conc }} eval=${{ inputs.run-eval }}' + name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.framework }} ${{ inputs.run-eval }} ${{ inputs.precision }} tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} conc=${{ inputs.conc }}' 
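The meta_env.json written in PATCH 160 above is the hand-off between each benchmark job and whatever later collects the eval results. A minimal sketch of the writer side in Python, assuming the same six fields the shell heredoc emits; write_meta_env and the argument values are illustrative, not shipped code:

    import json
    from pathlib import Path

    def write_meta_env(out_dir, model, framework, precision, tp, ep, dp_attention):
        # Same shape as the META heredoc in benchmark_lib.sh: one flat JSON
        # object that a collector can read back without knowing the run.
        meta = {
            "model": model,
            "framework": framework,
            "precision": precision,
            "tp": tp,
            "ep": ep,
            "dp_attention": dp_attention,
        }
        path = Path(out_dir) / "meta_env.json"
        path.write_text(json.dumps(meta, indent=2))
        return path

For example, write_meta_env("/tmp/eval_out", "gpt-oss", "vllm", "fp4", 8, 1, False) drops a file the sweep tooling can group on later.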
steps: - name: Resource cleanup run: | From 68c1a2db67851c9a94b5d53a2b9ceac766c7b522 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 30 Nov 2025 15:58:27 +0800 Subject: [PATCH 162/214] Missing eval env var docker --- .github/workflows/benchmark-tmpl.yml | 2 +- benchmarks/benchmark_lib.sh | 10 +++++----- benchmarks/dsr1_fp8_h200_trt_slurm.sh | 7 +++++++ runners/launch_b200-tg.sh | 2 +- runners/launch_h100-cr.sh | 2 +- runners/launch_mi300x-amd.sh | 2 +- runners/launch_mi300x-cr.sh | 2 +- runners/launch_mi355x-amd.sh | 2 +- 8 files changed, 18 insertions(+), 11 deletions(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 424d67b26..20d821541 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -74,7 +74,7 @@ jobs: benchmark: runs-on: ${{ inputs.runner }} timeout-minutes: 180 - name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.framework }} ${{ inputs.run-eval }} ${{ inputs.precision }} tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} conc=${{ inputs.conc }}' + ame: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.framework }} ${{ inputs.precision }} ${{ inputs.run-eval && ''eval '' || '''' }}tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} conc=${{ inputs.conc }}' steps: - name: Resource cleanup run: | diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index a2059db26..89581c65a 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -128,18 +128,18 @@ run_benchmark_serving() { _install_lm_eval_deps() { set +x python3 -m pip install -q --no-cache-dir "lm-eval[api]" || true - # Temporary: workaround known harness issue by using main + # Temporary: workaround issue by using main python3 -m pip install -q --no-cache-dir --no-deps \ "git+https://github.com/EleutherAI/lm-evaluation-harness.git@main" || true } # Patch lm-eval filters to be robust to empty strings via sitecustomize -# Patch lm-eval filters to be robust to empty strings via sitecustomize -_patch_lm_eval_filters() { +_patch_lm_eval() { set +x local patch_dir patch_dir="$(mktemp -d)" cat > "$patch_dir/sitecustomize.py" <<'PY' +# --- Patch LocalChatCompletion.parse_generations to handle empty content with reasoning_content --- import re, sys, unicodedata, json from lm_eval.filters import extraction as ex from lm_eval.models.openai_completions import LocalChatCompletion as _LCC @@ -167,7 +167,7 @@ def _le_parse_generations(outputs, **kwargs): # Keep staticmethod semantics _LCC.parse_generations = staticmethod(_le_parse_generations) -# --- Patch TemplateAPI.apply_chat_template to avoid injecting "type": "text" --- +# --- Patch TemplateAPI.apply_chat_template to avoid injecting "type": "text" for TRT --- try: from lm_eval.models import api_models as _api_models _TemplateAPI = _api_models.TemplateAPI @@ -234,7 +234,7 @@ run_lm_eval() { done _install_lm_eval_deps - _patch_lm_eval_filters + _patch_lm_eval local openai_server_base="http://0.0.0.0:${port}" local openai_chat_base="${openai_server_base}/v1/chat/completions" diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index ac6bc167c..be46a1768 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -86,3 +86,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; 
then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 )) + append_lm_eval_summary +fi +set +x \ No newline at end of file diff --git a/runners/launch_b200-tg.sh b/runners/launch_b200-tg.sh index b82e25276..9709b7a87 100644 --- a/runners/launch_b200-tg.sh +++ b/runners/launch_b200-tg.sh @@ -24,7 +24,7 @@ docker run --rm -d --network host --name $server_name \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE \ -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ +-e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ ${GH_SUM_ENV} ${GH_SUM_MOUNT} \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \ diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh index 9815e4884..ee2dab3da 100644 --- a/runners/launch_h100-cr.sh +++ b/runners/launch_h100-cr.sh @@ -20,7 +20,7 @@ docker run --rm --network=host --name=$server_name \ --runtime=nvidia --gpus=all --ipc=host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \ +-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RUN_EVAL -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \ -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ ${GH_SUM_ENV} ${GH_SUM_MOUNT} \ --entrypoint=/bin/bash \ diff --git a/runners/launch_mi300x-amd.sh b/runners/launch_mi300x-amd.sh index 85fa1f8c7..c721c44e9 100644 --- a/runners/launch_mi300x-amd.sh +++ b/runners/launch_mi300x-amd.sh @@ -24,7 +24,7 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ --e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \ +-e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL \ ${GH_SUM_ENV} ${GH_SUM_MOUNT} \ --entrypoint=/bin/bash \ $IMAGE \ diff --git a/runners/launch_mi300x-cr.sh b/runners/launch_mi300x-cr.sh index 4c9d56e7e..2084fbff0 100644 --- a/runners/launch_mi300x-cr.sh +++ b/runners/launch_mi300x-cr.sh @@ -24,7 +24,7 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ --e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \ +-e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL \ ${GH_SUM_ENV} ${GH_SUM_MOUNT} \ --entrypoint=/bin/bash \ $IMAGE \ diff --git a/runners/launch_mi355x-amd.sh b/runners/launch_mi355x-amd.sh index b1b11ff95..3ab15800e 100644 --- a/runners/launch_mi355x-amd.sh +++ b/runners/launch_mi355x-amd.sh @@ -46,7 +46,7 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name 
\ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT -e NUM_PROMPTS \ --e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \ +-e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL \ ${GH_SUM_ENV} ${GH_SUM_MOUNT} \ --entrypoint=/bin/bash \ $IMAGE \ From 6cb94a766308ce7008dd68728d673ab40a504a7a Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 30 Nov 2025 16:16:10 +0800 Subject: [PATCH 163/214] Typo --- .github/workflows/benchmark-tmpl.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 20d821541..f33bd8157 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -74,7 +74,7 @@ jobs: benchmark: runs-on: ${{ inputs.runner }} timeout-minutes: 180 - ame: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.framework }} ${{ inputs.precision }} ${{ inputs.run-eval && ''eval '' || '''' }}tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} conc=${{ inputs.conc }}' + name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.framework }} ${{ inputs.precision }} ${{ inputs.run-eval && ''eval '' || '''' }}tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} conc=${{ inputs.conc }}' steps: - name: Resource cleanup run: | From bc472c3b36aef3642b6063e35fce3defae07bb92 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 1 Dec 2025 00:19:44 +0800 Subject: [PATCH 164/214] Add proper coverage --- utils/matrix-logic/generate_sweep_configs.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index 1110e3c30..a9afc2bc9 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -89,16 +89,24 @@ def validate_matrix_output(matrix_values: List[dict]) -> List[dict]: def mark_eval_entries(matrix_values: List[dict]) -> List[dict]: """Mark entries that should run evaluation. 
- For each unique (model, runner, isl, osl) combination: + For each unique (model, runner, framework, precision, isl, osl) combination: - Mark highest TP with highest conc - Mark lowest TP with highest conc """ from collections import defaultdict - # Group entries by (model, runner, isl, osl) + # Group entries by (model, runner, framework, precision, isl, osl) + # This ensures we compare within the same configuration, not across different frameworks groups = defaultdict(list) for i, entry in enumerate(matrix_values): - key = (entry[FIELD_MODEL], entry[FIELD_RUNNER], entry[FIELD_ISL], entry[FIELD_OSL]) + key = ( + entry[FIELD_MODEL], + entry[FIELD_RUNNER], + entry[FIELD_FRAMEWORK], + entry[FIELD_PRECISION], + entry[FIELD_ISL], + entry[FIELD_OSL] + ) groups[key].append((i, entry)) # For each group, find highest TP/highest conc and lowest TP/highest conc From 2461447a4f1052de4e1b2d321273fbeb2a18cd28 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 3 Dec 2025 00:09:41 +0800 Subject: [PATCH 165/214] Add evals --- .github/workflows/benchmark-tmpl.yml | 24 ++- .github/workflows/collect-evals.yml | 45 +++++ .github/workflows/full-sweep-test.yml | 48 ++++++ utils/collect_eval_results.py | 237 ++++++++++++++++++++++++++ 4 files changed, 353 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/collect-evals.yml create mode 100644 utils/collect_eval_results.py diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index d5dc1e1b8..c8d6e2764 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -135,6 +135,9 @@ jobs: env: RUNNER_NAME: ${{ runner.name }} RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_${{ env.FRAMEWORK }}_tp${{ env.TP }}_ep${{ env.EP_SIZE }}_dpa_${{ env.DP_ATTENTION }}_conc${{ env.CONC }}_${{ runner.name }} + # Suppress per-job eval markdown from being appended to the step summary. + # We'll publish a single combined eval table in the collection job instead. 
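For intuition, here is the selection rule from mark_eval_entries (as refined above) reduced to a single (model, runner, framework, precision, isl, osl) group, with made-up TP and concurrency values:

    entries = [
        {"tp": 1, "conc": 4}, {"tp": 1, "conc": 64},
        {"tp": 8, "conc": 4}, {"tp": 8, "conc": 64},
    ]

    min_tp = min(e["tp"] for e in entries)
    max_tp = max(e["tp"] for e in entries)
    picked = []
    for tp in sorted({min_tp, max_tp}):
        candidates = [e for e in entries if e["tp"] == tp]
        picked.append(max(candidates, key=lambda e: e["conc"]))

    # The highest-concurrency entry at the lowest and at the highest TP
    # is marked for eval: [{'tp': 1, 'conc': 64}, {'tp': 8, 'conc': 64}]
    print(picked)

When a group has only one TP value, min_tp equals max_tp and a single entry is marked, which is what the min_tp != max_tp guard in the real code handles.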
+ GITHUB_STEP_SUMMARY: '' run: | bash ./runners/launch_${RUNNER_NAME%%_*}.sh FOUND_RESULT_FILE= @@ -162,4 +165,23 @@ jobs: uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 with: name: ${{ env.RESULT_FILENAME }} - path: agg_${{ env.RESULT_FILENAME }}.json \ No newline at end of file + path: agg_${{ env.RESULT_FILENAME }}.json + + - name: Upload eval results (if any) + if: ${{ env.RUN_EVAL == 'true' }} + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 + with: + name: eval_${{ env.EXP_NAME }}_${{ env.RESULT_FILENAME }} + path: eval_out/${{ env.RESULT_FILENAME }} + + - name: Cleanup eval outputs (post-upload) + if: ${{ env.RUN_EVAL == 'true' }} + run: | + if [ -n "${RESULT_FILENAME:-}" ] && [ -e "eval_out/${RESULT_FILENAME}" ]; then + echo "Removing eval dir: eval_out/${RESULT_FILENAME}" + rm -rf --one-file-system "eval_out/${RESULT_FILENAME}" || rm -rf "eval_out/${RESULT_FILENAME}" || true + fi + # Also remove empty parent folder if present + if [ -d "eval_out" ]; then + rmdir eval_out 2>/dev/null || true + fi diff --git a/.github/workflows/collect-evals.yml b/.github/workflows/collect-evals.yml new file mode 100644 index 000000000..6f7858238 --- /dev/null +++ b/.github/workflows/collect-evals.yml @@ -0,0 +1,45 @@ +name: Template - Collect Evals + +on: + workflow_call: + inputs: + exp-name: + required: false + type: string + default: '' + +permissions: + contents: read + +jobs: + collect-evals: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + token: ${{ secrets.REPO_PAT }} + fetch-depth: 0 + + - name: Download eval artifacts + uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0 + with: + path: eval_results/ + pattern: ${{ inputs.exp-name && format('eval_{0}_*', inputs.exp-name) || 'eval_*' }} + + - name: Summarize evals + run: | + echo "## 📋 Eval Summary - ${{ inputs.exp-name || 'all' }}" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + python3 utils/collect_eval_results.py eval_results/ ${{ inputs.exp-name || 'all' }} >> $GITHUB_STEP_SUMMARY + + - name: Upload aggregated evals + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 + with: + name: eval_results_${{ inputs.exp-name || 'all' }} + path: agg_eval_${{ inputs.exp-name || 'all' }}.json + + - name: Cleanup downloaded eval artifacts + if: ${{ always() }} + run: | + rm -rf eval_results/ || true diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index 8d96d5ac3..ad6bbb5ac 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -140,6 +140,14 @@ jobs: with: exp-name: "dsr1_1k1k" + collect-dsr1-1k1k-evals: + needs: benchmark-dsr1-1k1k + if: ${{ always() && needs.get-configs.outputs.dsr1-1k1k != '[]' }} + uses: ./.github/workflows/collect-evals.yml + secrets: inherit + with: + exp-name: "dsr1_1k1k" + # GPTOSS 1K1K Benchmarks benchmark-gptoss-1k1k: needs: get-configs @@ -175,6 +183,14 @@ jobs: with: exp-name: "gptoss_1k1k" + collect-gptoss-1k1k-evals: + needs: benchmark-gptoss-1k1k + if: ${{ always() && needs.get-configs.outputs.gptoss-1k1k != '[]' }} + uses: ./.github/workflows/collect-evals.yml + secrets: inherit + with: + exp-name: "gptoss_1k1k" + # DSR1 8K1K Benchmarks benchmark-dsr1-8k1k: @@ -211,6 +227,14 @@ jobs: with: exp-name: "dsr1_8k1k" + collect-dsr1-8k1k-evals: + needs: benchmark-dsr1-8k1k + if: ${{ always() && 
needs.get-configs.outputs.dsr1-8k1k != '[]' }} + uses: ./.github/workflows/collect-evals.yml + secrets: inherit + with: + exp-name: "dsr1_8k1k" + # GPTOSS 8K1K Benchmarks benchmark-gptoss-8k1k: needs: get-configs @@ -246,6 +270,14 @@ jobs: with: exp-name: "gptoss_8k1k" + collect-gptoss-8k1k-evals: + needs: benchmark-gptoss-8k1k + if: ${{ always() && needs.get-configs.outputs.gptoss-8k1k != '[]' }} + uses: ./.github/workflows/collect-evals.yml + secrets: inherit + with: + exp-name: "gptoss_8k1k" + # DSR1 1K8K Benchmarks benchmark-dsr1-1k8k: @@ -376,6 +408,14 @@ jobs: with: exp-name: "dsr1_1k8k" + collect-dsr1-1k8k-evals: + needs: benchmark-dsr1-1k8k + if: ${{ always() && needs.get-configs.outputs.dsr1-1k8k != '[]' }} + uses: ./.github/workflows/collect-evals.yml + secrets: inherit + with: + exp-name: "dsr1_1k8k" + # GPTOSS 1K8K Benchmarks benchmark-gptoss-1k8k: @@ -412,6 +452,14 @@ jobs: with: exp-name: "gptoss_1k8k" + collect-gptoss-1k8k-evals: + needs: benchmark-gptoss-1k8k + if: ${{ always() && needs.get-configs.outputs.gptoss-1k8k != '[]' }} + uses: ./.github/workflows/collect-evals.yml + secrets: inherit + with: + exp-name: "gptoss_1k8k" + calc-success-rate: needs: diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py new file mode 100644 index 000000000..de2af26e4 --- /dev/null +++ b/utils/collect_eval_results.py @@ -0,0 +1,237 @@ +#!/usr/bin/env python3 +import os +import sys +import json +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + + +def find_eval_sets(root: Path) -> List[Path]: + """Return directories that contain a meta_env.json (one set per job).""" + out: List[Path] = [] + for p in root.rglob('meta_env.json'): + out.append(p.parent) + return out + + +def load_json(path: Path) -> Optional[Dict[str, Any]]: + try: + with open(path, 'r') as f: + return json.load(f) + except Exception: + return None + + +def detect_eval_jsons(d: Path) -> Tuple[Optional[Path], Optional[Path]]: + """Return (lm_eval_json, lighteval_json) if present (latest by mtime).""" + lm: List[Tuple[float, Path]] = [] + le: List[Tuple[float, Path]] = [] + for p in d.rglob('*.json'): + if p.name == 'meta_env.json': + continue + data = load_json(p) + if not isinstance(data, dict): + continue + # Heuristics similar to utils/lm_eval_to_md.py + if 'lm_eval_version' in data or 'pretty_env_info' in data: + try: + lm.append((p.stat().st_mtime, p)) + except Exception: + lm.append((0, p)) + elif 'config_general' in data and 'results' in data: + try: + le.append((p.stat().st_mtime, p)) + except Exception: + le.append((0, p)) + elif 'results' in data: + # Fallback: treat as lm-eval JSON + try: + lm.append((p.stat().st_mtime, p)) + except Exception: + lm.append((0, p)) + lm_path = sorted(lm, key=lambda x: x[0])[-1][1] if lm else None + le_path = sorted(le, key=lambda x: x[0])[-1][1] if le else None + return lm_path, le_path + + +def parse_pretty_env(pretty_env: str) -> str: + try: + lines = [l for l in pretty_env.splitlines() if l.startswith('GPU ')] + names = [l.split(':', 1)[1].strip() for l in lines] + if not names: + return 'Unknown GPU' + # Compress identical names (roughly) + from collections import Counter + c = Counter(names) + return ' + '.join([f"{n}× {name}" for name, n in c.items()]) + except Exception: + return 'Unknown GPU' + + +def extract_lm_metrics(json_path: Path, task: Optional[str] = None) -> Dict[str, Any]: + data = load_json(json_path) or {} + results = data.get('results') or {} + # Pick task + t = task + if not t: + if isinstance(results, dict) and 
results:
+            t = next(iter(results.keys()))
+        else:
+            t = 'unknown'
+
+    res = results.get(t, {}) if isinstance(results, dict) else {}
+    strict = res.get('exact_match,strict-match')
+    flex = res.get('exact_match,flexible-extract')
+    strict_se = res.get('exact_match_stderr,strict-match')
+    flex_se = res.get('exact_match_stderr,flexible-extract')
+
+    n_eff = None
+    ns = data.get('n-samples') or data.get('n_samples') or {}
+    if isinstance(ns, dict):
+        td = ns.get(t) or {}
+        if isinstance(td, dict):
+            n_eff = td.get('effective') or td.get('n_eff')
+
+    hardware = 'Unknown GPU'
+    pe = data.get('pretty_env_info')
+    if isinstance(pe, str) and pe:
+        hardware = parse_pretty_env(pe)
+
+    model = (
+        data.get('model_name')
+        or (data.get('configs', {}).get(t, {}) or {}).get('metadata', {}).get('model')
+        or (data.get('config') or {}).get('model')
+        or ''
+    )
+
+    return {
+        'task': t,
+        'strict': strict,
+        'flex': flex,
+        'strict_se': strict_se,
+        'flex_se': flex_se,
+        'n_eff': n_eff,
+        'hardware': hardware,
+        'model': model,
+        'source': str(json_path)
+    }
+
+
+def extract_lighteval_metrics(json_path: Path, task_base: Optional[str] = None) -> Dict[str, Any]:
+    data = load_json(json_path) or {}
+    results = data.get('results', {}) or {}
+    # Choose a task key starting with task_base if provided, else 'all', else first key
+    key = None
+    if task_base:
+        for k in results.keys():
+            if str(k).startswith(task_base):
+                key = k
+                break
+    if key is None:
+        key = 'all' if 'all' in results else (next(iter(results.keys())) if results else 'unknown')
+    r = results.get(key, {}) if isinstance(results, dict) else {}
+    em = r.get('extractive_match')
+    em_se = r.get('extractive_match_stderr')
+
+    model = ''
+    cg = data.get('config_general', {}) or {}
+    model = cg.get('model_name') or cg.get('model_config', {}).get('model_name', '')
+
+    return {
+        'task': key,
+        'strict': em,
+        'flex': None,
+        'strict_se': em_se,
+        'flex_se': None,
+        'n_eff': None,
+        'hardware': 'Unknown GPU',
+        'model': model,
+        'source': str(json_path)
+    }
+
+
+def pct(x: Any) -> str:
+    try:
+        return f"{float(x)*100:.2f}%"
+    except Exception:
+        return 'N/A'
+
+
+def se(x: Any) -> str:
+    try:
+        return f" ±{float(x)*100:.2f}%"
+    except Exception:
+        return ''
+
+
+def main():
+    if len(sys.argv) < 3:
+        print('Usage: collect_eval_results.py <results_dir> <exp_name>')
+        sys.exit(1)
+
+    root = Path(sys.argv[1])
+    exp_name = sys.argv[2] or 'all'
+
+    rows: List[Dict[str, Any]] = []
+    for d in find_eval_sets(root):
+        meta = load_json(d / 'meta_env.json') or {}
+        lm_path, le_path = detect_eval_jsons(d)
+        # Prefer lm-eval when available, else lighteval
+        if lm_path:
+            m = extract_lm_metrics(lm_path)
+        elif le_path:
+            m = extract_lighteval_metrics(le_path)
+        else:
+            continue
+
+        # Merge with meta
+        row = {
+            'model': m.get('model') or meta.get('model') or 'unknown',
+            'hw': m.get('hardware', 'Unknown GPU'),
+            'framework': (meta.get('framework') or 'unknown').lower(),
+            'precision': (meta.get('precision') or 'unknown').lower(),
+            'tp': int(meta.get('tp') or 1),
+            'ep': int(meta.get('ep') or 1),
+            'dp_attention': str(meta.get('dp_attention') or 'false'),
+            'task': m.get('task') or 'unknown',
+            'em_strict': m.get('strict'),
+            'em_strict_se': m.get('strict_se'),
+            'em_flexible': m.get('flex'),
+            'em_flexible_se': m.get('flex_se'),
+            'n_eff': m.get('n_eff'),
+            'source': m.get('source'),
+        }
+        rows.append(row)
+
+    # Sort for stable output
+    rows.sort(key=lambda r: (r.get('model',''), r.get('hw',''), r.get('framework',''), r.get('precision',''), r.get('tp',0), r.get('ep',0)))
+
+    if not rows:
print('> No eval results found to summarize.') + else: + # Print Markdown summary table + print('| Model | Hardware | Framework | Precision | TP | EP | DPA | Task | EM Strict | EM Flexible | N (eff) |') + print('| :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |') + for r in rows: + print( + f"| {r['model']} " + f"| {r['hw']} " + f"| {r['framework'].upper()} " + f"| {r['precision'].upper()} " + f"| {r['tp']} " + f"| {r['ep']} " + f"| {r['dp_attention']} " + f"| {r['task']} " + f"| {pct(r['em_strict'])}{se(r['em_strict_se'])} " + f"| {pct(r['em_flexible'])}{se(r['em_flexible_se'])} " + f"| {r['n_eff'] or ''} |" + ) + + # Write JSON aggregate + out_path = Path(f'agg_eval_{exp_name}.json') + with open(out_path, 'w') as f: + json.dump(rows, f, indent=2) + + +if __name__ == '__main__': + main() From 710d4280a0ac0527d5765a6e6da6660235e68710 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 3 Dec 2025 01:12:29 +0800 Subject: [PATCH 166/214] Cam's solution --- .github/workflows/benchmark-tmpl.yml | 17 +++-- benchmarks/benchmark_lib.sh | 28 ++++++++- utils/collect_eval_results.py | 92 ++++++++++++++++++++-------- 3 files changed, 98 insertions(+), 39 deletions(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index c8d6e2764..559f24fc9 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -172,16 +172,15 @@ jobs: uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 with: name: eval_${{ env.EXP_NAME }}_${{ env.RESULT_FILENAME }} - path: eval_out/${{ env.RESULT_FILENAME }} + path: | + SUMMARY.md + meta_env.json + results*.json + if-no-files-found: ignore - name: Cleanup eval outputs (post-upload) if: ${{ env.RUN_EVAL == 'true' }} run: | - if [ -n "${RESULT_FILENAME:-}" ] && [ -e "eval_out/${RESULT_FILENAME}" ]; then - echo "Removing eval dir: eval_out/${RESULT_FILENAME}" - rm -rf --one-file-system "eval_out/${RESULT_FILENAME}" || rm -rf "eval_out/${RESULT_FILENAME}" || true - fi - # Also remove empty parent folder if present - if [ -d "eval_out" ]; then - rmdir eval_out 2>/dev/null || true - fi + rm -f SUMMARY.md meta_env.json || true + # Remove any eval results JSONs that were moved into workspace + rm -f results*.json || true diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 8fcf9a707..ad75fc9bd 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -326,9 +326,28 @@ META fi fi - # Note: Per policy, eval outputs stay under /tmp only; do not copy to workspace. 
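The pct and se helpers in collect_eval_results.py above deliberately swallow malformed values so a single bad result JSON cannot take down the whole summary table. A short usage sketch:

    def pct(x):
        try:
            return f"{float(x) * 100:.2f}%"
        except Exception:
            return 'N/A'

    def se(x):
        try:
            return f" ±{float(x) * 100:.2f}%"
        except Exception:
            return ''

    # A strict-match accuracy of 0.953 with stderr 0.006 renders as
    # "95.30% ±0.60%", while a missing value degrades to "N/A".
    print(pct(0.953) + se(0.006))   # 95.30% ±0.60%
    print(pct(None))                # N/A

Markdown cells stay well-formed either way, so one failed eval shows up as an N/A row instead of a broken table.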
+  # Move eval artifacts into PWD (no new directories in workspace)
+  if [ -f "${summary_md}" ]; then
+    mv -f "${summary_md}" ./ || true
+  fi
+  if [ -f "${meta_json}" ]; then
+    mv -f "${meta_json}" ./ || true
+  fi
+  if [ -d "${out_dir}" ]; then
+    while IFS= read -r -d '' jf; do
+      base=$(basename "$jf")
+      if [ "$base" != "meta_env.json" ] && [ "$base" != "SUMMARY.md" ]; then
+        mv -f "$jf" ./ || true
+      fi
+    done < <(find "${out_dir}" -type f -name "*.json" -print0 2>/dev/null)
+  fi
 
-  echo "Results saved to: ${summary_md}"
+  # Best-effort cleanup of the temp directory
+  if [ -n "${out_dir}" ] && [ -d "${out_dir}" ]; then
+    rm -rf --one-file-system "${out_dir}" || rm -rf "${out_dir}" || true
+  fi
+
+  echo "Moved eval artifacts to: $(pwd)"
 }
 
 # ------------------------------
@@ -565,7 +584,7 @@ run_lighteval_eval() {
   local port="${PORT:-8888}"
   local task="${EVAL_TASK:-gsm8k}"
   local num_fewshot="${NUM_FEWSHOT:-5}"
-  local results_dir="${EVAL_RESULT_DIR:-eval_out_lighteval}"
+  local results_dir="${EVAL_RESULT_DIR:-$(mktemp -d /tmp/eval_out-XXXXXX)}"
   local max_samples=0
   local concurrent_requests=32
@@ -611,6 +630,9 @@ run_lighteval_eval() {
     output_dir="/workspace/${results_dir}"
   fi
 
+  # Make output dir visible to append_lm_eval_summary
+  export EVAL_RESULT_DIR="$output_dir"
+
   set -x
   lighteval endpoint litellm \
     "${MODEL_ARGS}" \
diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py
index de2af26e4..51b2a71b6 100644
--- a/utils/collect_eval_results.py
+++ b/utils/collect_eval_results.py
@@ -7,8 +7,29 @@
 
 def find_eval_sets(root: Path) -> List[Path]:
-    """Return directories that contain a meta_env.json (one set per job)."""
+    """Return directories that contain a meta_env.json (one set per job).
+
+    New structure: each downloaded artifact is placed under
+    eval_results/<artifact-name>/ with flat files inside, e.g.:
+      - meta_env.json
+      - SUMMARY.md
+      - results_*.json
+
+    We first check immediate child directories for meta_env.json to avoid
+    descending unnecessarily. If nothing is found (backward compatibility),
+    fall back to recursive search.
+    """
     out: List[Path] = []
+    # Prefer immediate children (one directory per artifact)
+    try:
+        for d in root.iterdir():
+            if d.is_dir() and (d / 'meta_env.json').exists():
+                out.append(d)
+    except Exception:
+        pass
+    if out:
+        return out
+    # Fallback: recursive (legacy structure)
     for p in root.rglob('meta_env.json'):
         out.append(p.parent)
     return out
@@ -23,32 +44,49 @@ def load_json(path: Path) -> Optional[Dict[str, Any]]:
 
 def detect_eval_jsons(d: Path) -> Tuple[Optional[Path], Optional[Path]]:
-    """Return (lm_eval_json, lighteval_json) if present (latest by mtime)."""
+    """Return (lm_eval_json, lighteval_json) if present (latest by mtime).
+
+    New structure places result JSONs flat in the artifact directory.
We + first check only the immediate directory for JSONs, then fall back to + recursive search for backward compatibility. + """ + def scan_jsons(paths: List[Path]) -> Tuple[List[Tuple[float, Path]], List[Tuple[float, Path]]]: + lm: List[Tuple[float, Path]] = [] + le: List[Tuple[float, Path]] = [] + for p in paths: + if p.name == 'meta_env.json': + continue + data = load_json(p) + if not isinstance(data, dict): + continue + # Heuristics similar to utils/lm_eval_to_md.py + if 'lm_eval_version' in data or 'pretty_env_info' in data: + try: + lm.append((p.stat().st_mtime, p)) + except Exception: + lm.append((0, p)) + elif 'config_general' in data and 'results' in data: + try: + le.append((p.stat().st_mtime, p)) + except Exception: + le.append((0, p)) + elif 'results' in data: + # Fallback: treat as lm-eval JSON + try: + lm.append((p.stat().st_mtime, p)) + except Exception: + lm.append((0, p)) + return lm, le + + # 1) Prefer immediate JSONs (flat structure) + immediate_jsons = list(d.glob('results*.json')) + [p for p in d.glob('*.json') if p.name != 'meta_env.json'] + lm, le = scan_jsons(immediate_jsons) + + # 2) If nothing found, fallback to deep scan (legacy) + if not lm and not le: + deep_jsons = list(d.rglob('*.json')) + lm, le = scan_jsons(deep_jsons) + lm_path = sorted(lm, key=lambda x: x[0])[-1][1] if lm else None le_path = sorted(le, key=lambda x: x[0])[-1][1] if le else None return lm_path, le_path From 3c8b9bc792203fb00e246d1cecd3fb12b27e8044 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 3 Dec 2025 03:31:26 +0800 Subject: [PATCH 167/214] b200 scancel fix --- runners/launch_b200-nb.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/runners/launch_b200-nb.sh b/runners/launch_b200-nb.sh index f9b68c025..9a3dfa909 100644 --- a/runners/launch_b200-nb.sh +++ b/runners/launch_b200-nb.sh @@ -13,5 +13,3 @@ srun --partition=$PARTITION --gres=gpu:$TP --exclusive \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT_OFFSET=${USER: -1} \ bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_slurm.sh - -scancel $JOB_ID From 1390c5230f32c68fbd1b64d4e5e013a2ba12a868 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 3 Dec 2025 04:08:08 +0800 Subject: [PATCH 168/214] Change to 2 fewshot, forgot eval env var in b200 --- benchmarks/benchmark_lib.sh | 2 +- benchmarks/dsr1_fp4_b200_trt_slurm.sh | 7 +++++++ benchmarks/dsr1_fp8_b200_trt_slurm.sh | 7 +++++++ runners/launch_b200-dgxc.sh | 2 +- runners/launch_b200-nb.sh | 2 +- runners/launch_b200-nvd.sh | 2 +- 6 files changed, 18 insertions(+), 4 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index ad75fc9bd..e68e1b21d 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -242,7 +242,7 @@ PY run_lm_eval() { local port="${PORT:-8888}" local task="${EVAL_TASK:-gsm8k}" - local num_fewshot="${NUM_FEWSHOT:-5}" + local num_fewshot="${NUM_FEWSHOT:-2}" local results_dir="${EVAL_RESULT_DIR:-$(mktemp -d /tmp/eval_out-XXXXXX)}" local gen_max_tokens=4096 local temperature=0 diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh index aa2be7648..f4165b72a 100644 --- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh @@ -116,3 +116,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port 
"$PORT" --concurrent-requests $(( $CONC * 2 )) + append_lm_eval_summary +fi +set +x \ No newline at end of file diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh index 58d24a7ed..c77f5277f 100644 --- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh @@ -86,3 +86,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 )) + append_lm_eval_summary +fi +set +x \ No newline at end of file diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index 4d8ec0aed..25d09313e 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -41,7 +41,7 @@ docker run --rm --init --network host --name $server_name \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE -e DP_ATTENTION \ -e NCCL_GRAPH_REGISTER=0 \ -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e NUM_PROMPTS \ +-e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e NUM_PROMPTS -e RUN_EVAL \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh" diff --git a/runners/launch_b200-nb.sh b/runners/launch_b200-nb.sh index 9a3dfa909..44392e3aa 100644 --- a/runners/launch_b200-nb.sh +++ b/runners/launch_b200-nb.sh @@ -12,4 +12,4 @@ srun --partition=$PARTITION --gres=gpu:$TP --exclusive \ --no-container-mount-home --container-writable \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT_OFFSET=${USER: -1} \ -bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_slurm.sh +bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_slurm.sh \ No newline at end of file diff --git a/runners/launch_b200-nvd.sh b/runners/launch_b200-nvd.sh index ebfa67458..c6ae289bb 100644 --- a/runners/launch_b200-nvd.sh +++ b/runners/launch_b200-nvd.sh @@ -42,7 +42,7 @@ docker run --rm --init --network host --name $server_name \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE -e DP_ATTENTION \ -e NCCL_GRAPH_REGISTER=0 \ -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e NUM_PROMPTS \ +-e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e NUM_PROMPTS -e RUN_EVAL \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh" From 544e6986cd4e948a12aebf53ab9ad6a1837d05c3 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 3 Dec 2025 16:26:57 +0800 Subject: [PATCH 169/214] Resolve issues --- .github/workflows/benchmark-tmpl.yml | 3 +- .github/workflows/collect-evals.yml | 2 +- .github/workflows/drain-b200-nvd2.yml | 32 -- .../workflows/full-sweep-1k1k-scheduler.yml | 4 +- .../workflows/full-sweep-1k8k-scheduler.yml | 4 +- .../workflows/full-sweep-8k1k-scheduler.yml | 4 +- .github/workflows/full-sweep-test.yml | 12 +- benchmarks/benchmark_lib.sh | 51 ++- 
benchmarks/gptoss_fp4_b200_trt_slurm.sh | 7 + utils/collect_eval_results.py | 108 ++++++- utils/evals/custom_gsm8k.py | 4 +- utils/evals/gsm8k.yaml | 2 + utils/lm_eval_to_md.py | 304 ------------------ utils/matrix-logic/generate_sweep_configs.py | 10 +- 14 files changed, 158 insertions(+), 389 deletions(-) delete mode 100644 .github/workflows/drain-b200-nvd2.yml delete mode 100644 utils/lm_eval_to_md.py diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 559f24fc9..8a8943628 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -173,7 +173,6 @@ jobs: with: name: eval_${{ env.EXP_NAME }}_${{ env.RESULT_FILENAME }} path: | - SUMMARY.md meta_env.json results*.json if-no-files-found: ignore @@ -181,6 +180,6 @@ jobs: - name: Cleanup eval outputs (post-upload) if: ${{ env.RUN_EVAL == 'true' }} run: | - rm -f SUMMARY.md meta_env.json || true + rm -f meta_env.json || true # Remove any eval results JSONs that were moved into workspace rm -f results*.json || true diff --git a/.github/workflows/collect-evals.yml b/.github/workflows/collect-evals.yml index 6f7858238..c45842ef2 100644 --- a/.github/workflows/collect-evals.yml +++ b/.github/workflows/collect-evals.yml @@ -29,7 +29,7 @@ jobs: - name: Summarize evals run: | - echo "## 📋 Eval Summary - ${{ inputs.exp-name || 'all' }}" >> $GITHUB_STEP_SUMMARY + echo "## Eval Summary - ${{ inputs.exp-name || 'all' }}" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY python3 utils/collect_eval_results.py eval_results/ ${{ inputs.exp-name || 'all' }} >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/drain-b200-nvd2.yml b/.github/workflows/drain-b200-nvd2.yml deleted file mode 100644 index 08646c3b4..000000000 --- a/.github/workflows/drain-b200-nvd2.yml +++ /dev/null @@ -1,32 +0,0 @@ -name: Drain b200-nvd_2 - -on: - push: - paths: - - '.github/workflows/drain-b200-nvd2.yml' - -jobs: - hold: - # Pin specifically to the self-hosted runner label for b200-nvd_2 - runs-on: [self-hosted, b200-nvd_2] - # Hold for 24h by default (override by canceling anytime) - timeout-minutes: 1440 - steps: - - name: Start drain - shell: bash - run: | - set -euo pipefail - echo "Holding runner: $RUNNER_NAME" - echo "Runner OS/Arch: $RUNNER_OS / $RUNNER_ARCH" - echo "Started at: $(date -Iseconds)" - echo "Cancel this workflow run to release the runner." 
- - - name: Hold indefinitely (until timeout or cancel) - shell: bash - run: | - set -euo pipefail - trap 'echo "Release signal received at $(date -Iseconds)"; exit 0' INT TERM - while true; do - echo "Still holding at $(date -Iseconds)" - sleep 1800 - done diff --git a/.github/workflows/full-sweep-1k1k-scheduler.yml b/.github/workflows/full-sweep-1k1k-scheduler.yml index f97cd093c..71b41949f 100644 --- a/.github/workflows/full-sweep-1k1k-scheduler.yml +++ b/.github/workflows/full-sweep-1k1k-scheduler.yml @@ -17,7 +17,7 @@ jobs: - id: get-dsr1-configs run: | pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1 --run-evals) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT get-gptoss-configs: @@ -31,7 +31,7 @@ jobs: - id: get-gptoss-configs run: | pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss --run-evals) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT benchmark-dsr1: diff --git a/.github/workflows/full-sweep-1k8k-scheduler.yml b/.github/workflows/full-sweep-1k8k-scheduler.yml index cd8c07c74..9bf67c8da 100644 --- a/.github/workflows/full-sweep-1k8k-scheduler.yml +++ b/.github/workflows/full-sweep-1k8k-scheduler.yml @@ -17,7 +17,7 @@ jobs: - id: get-dsr1-configs run: | pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1 --run-evals) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT get-gptoss-configs: @@ -31,7 +31,7 @@ jobs: - id: get-gptoss-configs run: | pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss --run-evals) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT benchmark-dsr1: diff --git 
a/.github/workflows/full-sweep-8k1k-scheduler.yml b/.github/workflows/full-sweep-8k1k-scheduler.yml index 036794eef..cfc676911 100644 --- a/.github/workflows/full-sweep-8k1k-scheduler.yml +++ b/.github/workflows/full-sweep-8k1k-scheduler.yml @@ -17,7 +17,7 @@ jobs: - id: get-dsr1-configs run: | pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1 --run-evals) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT get-gptoss-configs: @@ -31,7 +31,7 @@ jobs: - id: get-gptoss-configs run: | pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss --run-evals) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT benchmark-dsr1: diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index ad6bbb5ac..3ba954838 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -63,21 +63,21 @@ jobs: # Generate dsr1 configs (only if we have valid runner types for DSR1) if [ "${{ inputs.run_1k1k }}" = "true" ] && [ -n "$DSR1_RUNNER_TYPES" ]; then - DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml --run-evals) echo "dsr1-1k1k=$DSR1_1K1K" >> $GITHUB_OUTPUT else echo "dsr1-1k1k=[]" >> $GITHUB_OUTPUT fi if [ "${{ inputs.run_1k8k }}" = "true" ] && [ -n "$DSR1_RUNNER_TYPES" ]; then - DSR1_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + DSR1_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config 
${GITHUB_WORKSPACE}/.github/configs/runners.yaml --run-evals) echo "dsr1-1k8k=$DSR1_1K8K" >> $GITHUB_OUTPUT else echo "dsr1-1k8k=[]" >> $GITHUB_OUTPUT fi if [ "${{ inputs.run_8k1k }}" = "true" ] && [ -n "$DSR1_RUNNER_TYPES" ]; then - DSR1_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + DSR1_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml --run-evals) echo "dsr1-8k1k=$DSR1_8K1K" >> $GITHUB_OUTPUT else echo "dsr1-8k1k=[]" >> $GITHUB_OUTPUT @@ -85,21 +85,21 @@ jobs: # Generate gptoss configs (only if we have runner types selected) if [ "${{ inputs.run_1k1k }}" = "true" ] && [ -n "$RUNNER_TYPES" ]; then - GPTOSS_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + GPTOSS_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml --run-evals) echo "gptoss-1k1k=$GPTOSS_1K1K" >> $GITHUB_OUTPUT else echo "gptoss-1k1k=[]" >> $GITHUB_OUTPUT fi if [ "${{ inputs.run_1k8k }}" = "true" ] && [ -n "$RUNNER_TYPES" ]; then - GPTOSS_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + GPTOSS_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml --run-evals) echo "gptoss-1k8k=$GPTOSS_1K8K" >> $GITHUB_OUTPUT else echo "gptoss-1k8k=[]" >> $GITHUB_OUTPUT fi if [ "${{ inputs.run_8k1k }}" = "true" ] && [ -n "$RUNNER_TYPES" ]; then - GPTOSS_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + GPTOSS_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml 
${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml --run-evals) echo "gptoss-8k1k=$GPTOSS_8K1K" >> $GITHUB_OUTPUT else echo "gptoss-8k1k=[]" >> $GITHUB_OUTPUT diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index e68e1b21d..141f66c5a 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -288,7 +288,6 @@ append_lm_eval_summary() { local results_dir="${EVAL_RESULT_DIR}" local task="${EVAL_TASK:-gsm8k}" local out_dir="${results_dir}" - local summary_md="${out_dir}/SUMMARY.md" mkdir -p "$out_dir" || true # Write minimal meta for collectors that expect it @@ -296,10 +295,32 @@ append_lm_eval_summary() { local model_name="${MODEL_NAME:-$MODEL}" local dp_json="false" if [ "${DP_ATTENTION}" = "true" ]; then dp_json="true"; fi + + # Derive framework/precision from env, fallback to parsing RESULT_FILENAME + # RESULT_FILENAME format (from workflow): + # <exp_name>_<precision>_<framework>_tp<...>_ep<...>_dpa_<...>_conc<...>_<runner_name> + local fw="${FRAMEWORK:-}" + local prec="${PRECISION:-}" + if [[ -z "$fw" || -z "$prec" ]]; then + if [[ -n "${RESULT_FILENAME}" ]]; then + # Extract the two fields immediately before "_tp" + # Handles arbitrary underscores in exp_name by matching from the end + local parsed + parsed=$(echo "${RESULT_FILENAME}" | sed -n 's/.*_\([^_][^_]*\)_\([^_][^_]*\)_tp.*/\1 \2/p') + local p1="${parsed%% *}" + local p2="${parsed#* }" + if [[ -z "$prec" && -n "$p1" && "$p1" != "$parsed" ]]; then + prec="$p1" + fi + if [[ -z "$fw" && -n "$p2" && "$p2" != "$parsed" ]]; then + fw="$p2" + fi + fi + fi cat > "${meta_json}" < "$summary_md" || true - - # If running inside a GitHub Actions step on this same machine, append there too - if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then - local GH_SUM_DIR - GH_SUM_DIR="$(dirname "$GITHUB_STEP_SUMMARY")" - if [ -d "$GH_SUM_DIR" ] && [ -w "$GH_SUM_DIR" ]; then - cat "$summary_md" >> "$GITHUB_STEP_SUMMARY" || true - fi - fi - # Move eval artifacts into PWD (no new directories in workspace) - if [ -f "${summary_md}" ]; then - mv -f "${summary_md}" ./ || true - fi if [ -f "${meta_json}" ]; then mv -f "${meta_json}" ./ || true fi if [ -d "${out_dir}" ]; then while IFS= read -r -d '' jf; do base=$(basename "$jf") - if [ "$base" != "meta_env.json" ] && [ "$base" != "SUMMARY.md" ]; then + if [ "$base" != "meta_env.json" ]; then mv -f "$jf" ./ || true fi done < <(find "${out_dir}" -type f -name "*.json" -print0 2>/dev/null) diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh index 44e9dbf4c..56a64bb3a 100644 --- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh @@ -94,3 +94,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 )) + append_lm_eval_summary +fi +set +x diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py index 51b2a71b6..0254d640c 100644 --- a/utils/collect_eval_results.py +++ b/utils/collect_eval_results.py @@ -12,7 +12,6 @@ def find_eval_sets(root: Path) -> List[Path]: New structure: each downloaded artifact is placed under eval_results/<...>/ with flat files inside, e.g.: - meta_env.json - - SUMMARY.md - results_*.json We first check immediate child directories for
meta_env.json to avoid @@ -51,6 +50,23 @@ def detect_eval_jsons(d: Path) -> Tuple[Optional[Path], Optional[Path]]: recursive search for backward compatibility. """ def scan_jsons(paths: List[Path]) -> Tuple[List[Tuple[float, Path]], List[Tuple[float, Path]]]: + """Classify JSON files into lm-eval vs lighteval buckets. + + Returns two lists of (mtime, path) where: + - The first list contains candidates that look like lm-eval outputs. + - The second list contains candidates that look like lighteval outputs. + + Heuristics used (order matters): + - If a JSON has keys like 'lm_eval_version' or 'pretty_env_info', + we treat it as an lm-eval result file. + - If it has both 'config_general' and 'results', we treat it as + a lighteval result file. + - If it only has a top-level 'results' but none of the stronger + signals above, we fall back to classifying it as lm-eval. + + We keep the file modification time to later choose the most recent + candidate; if obtaining mtime fails, we fall back to 0. + """ lm: List[Tuple[float, Path]] = [] le: List[Tuple[float, Path]] = [] for p in paths: @@ -59,13 +75,14 @@ def scan_jsons(paths: List[Path]) -> Tuple[List[Tuple[float, Path]], List[Tuple[ data = load_json(p) if not isinstance(data, dict): continue - # Heuristics similar to utils/lm_eval_to_md.py if 'lm_eval_version' in data or 'pretty_env_info' in data: + # lm-eval harness output try: lm.append((p.stat().st_mtime, p)) except Exception: lm.append((0, p)) elif 'config_general' in data and 'results' in data: + # lighteval output structure try: le.append((p.stat().st_mtime, p)) except Exception: @@ -109,19 +126,92 @@ def parse_pretty_env(pretty_env: str) -> str: def extract_lm_metrics(json_path: Path, task: Optional[str] = None) -> Dict[str, Any]: data = load_json(json_path) or {} results = data.get('results') or {} - # Pick task + # Determine task key robustly: + # 1) explicit argument + # 2) only key in `results` + # 3) only key in `configs` + # 4) 'unknown' t = task if not t: - if isinstance(results, dict) and results: + if isinstance(results, dict) and len(results) == 1: t = next(iter(results.keys())) else: - t = 'unknown' + cfgs = data.get('configs') or {} + if isinstance(cfgs, dict) and len(cfgs) == 1: + t = next(iter(cfgs.keys())) + else: + # fallback to arbitrary but stable choice + t = next(iter(results.keys()), 'unknown') if isinstance(results, dict) else 'unknown' res = results.get(t, {}) if isinstance(results, dict) else {} - strict = res.get('exact_match,strict-match') - flex = res.get('exact_match,flexible-extract') - strict_se = res.get('exact_match_stderr,strict-match') - flex_se = res.get('exact_match_stderr,flexible-extract') + + # Determine base metric name (e.g., 'exact_match') + base_metric: Optional[str] = None + hib = (data.get('higher_is_better') or {}).get(t) if isinstance(data.get('higher_is_better'), dict) else None + if isinstance(hib, dict) and hib: + base_metric = next(iter(hib.keys())) + if not base_metric: + cfg = (data.get('configs') or {}).get(t, {}) if isinstance(data.get('configs'), dict) else {} + ml = cfg.get('metric_list') if isinstance(cfg, dict) else None + if isinstance(ml, list) and ml: + m0 = ml[0] or {} + if isinstance(m0, dict): + base_metric = m0.get('metric') + if not base_metric: + # Fallback: infer from result keys + if isinstance(res, dict): + for k in res.keys(): + if isinstance(k, str) and ',' in k: + base_metric = k.split(',', 1)[0] + break + if not base_metric and 'exact_match' in res: + base_metric = 'exact_match' + if not base_metric: + base_metric 
= 'exact_match' + + # Determine filter names and map to strict/flexible logically without guessing + strict_name: Optional[str] = None + flex_name: Optional[str] = None + cfg = (data.get('configs') or {}).get(t, {}) if isinstance(data.get('configs'), dict) else {} + fl = cfg.get('filter_list') if isinstance(cfg, dict) else None + filter_names: List[str] = [] + if isinstance(fl, list): + for it in fl: + if isinstance(it, dict): + nm = it.get('name') + if isinstance(nm, str): + filter_names.append(nm) + # Prefer semantic names when present; otherwise preserve file order + for nm in filter_names: + if strict_name is None and 'strict' in nm.lower(): + strict_name = nm + if flex_name is None and ('flex' in nm.lower() or 'extract' in nm.lower()): + flex_name = nm + # Fallback to first/second if semantic match not found + if not strict_name and filter_names: + strict_name = filter_names[0] + if not flex_name and len(filter_names) >= 2: + flex_name = filter_names[1] + + # Extract metrics present in results using derived keys + def get_pair(fname: Optional[str]) -> Tuple[Optional[float], Optional[float]]: + if not fname: + # try unfiltered key + v = res.get(base_metric) + se = res.get(f"{base_metric}_stderr") + try: + return float(v) if v is not None else None, float(se) if se is not None else None + except Exception: + return v, se + v = res.get(f"{base_metric},{fname}") + se = res.get(f"{base_metric}_stderr,{fname}") + try: + return float(v) if v is not None else None, float(se) if se is not None else None + except Exception: + return v, se + + strict, strict_se = get_pair(strict_name) + flex, flex_se = get_pair(flex_name) n_eff = None ns = data.get('n-samples') or data.get('n_samples') or {} diff --git a/utils/evals/custom_gsm8k.py b/utils/evals/custom_gsm8k.py index 4449188fa..5445f5732 100644 --- a/utils/evals/custom_gsm8k.py +++ b/utils/evals/custom_gsm8k.py @@ -1,3 +1,5 @@ +# Copied from https://github.com/huggingface/lighteval/blob/99ef5b98d422cf3620eebec9db13285493d35542/src/lighteval/tasks/tasks/gsm8k.py +# Increases generation size to 768 from 256 to better accommodate longer solutions by dsr1. from lighteval.metrics.metrics import Metrics from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.tasks.gsm8k import gsm8k_prompt @@ -11,7 +13,7 @@ evaluation_splits=["test"], few_shots_split=None, few_shots_select="random_sampling_from_train", - generation_size=768, # raise this as needed + generation_size=768, # raised this from 256 metrics=[Metrics.expr_gold_metric], stop_sequence=None, # avoid early stop on "Question:" version=0, diff --git a/utils/evals/gsm8k.yaml b/utils/evals/gsm8k.yaml index ab3113dc2..73a1f7c1e 100644 --- a/utils/evals/gsm8k.yaml +++ b/utils/evals/gsm8k.yaml @@ -1,3 +1,5 @@ +# YAML from https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/gsm8k/gsm8k.yaml +# Changed doc_to_text so model answers properly. Also see lm-evaluation-harness#3411. tag: - math_word_problems task: gsm8k diff --git a/utils/lm_eval_to_md.py b/utils/lm_eval_to_md.py deleted file mode 100644 index 0c59bc494..000000000 --- a/utils/lm_eval_to_md.py +++ /dev/null @@ -1,304 +0,0 @@ -#!/usr/bin/env python3 -""" -Convert latest lm-evaluation-harness and/or lighteval JSONs in a results dir -into Markdown tables for GitHub Actions job summary. Prints to stdout. 
- -Usage (same as before, works even if FRAMEWORK/PRECISION env vars are empty): - python3 utils/lm_eval_to_md.py \ - --results-dir /workspace/eval_out \ - --task gsm8k \ - --framework vLLM \ - --precision fp16 \ - --tp 4 \ - --ep 1 \ - --dp-attention false -""" -import argparse -import json -import os -import re -import sys -from collections import Counter -from glob import glob -from typing import Optional, Tuple, Dict, Any, List - - -# ----------------------- -# Helpers -# ----------------------- - -def pct(x): - return f"{x*100:.2f}%" if isinstance(x, (int, float)) else "N/A" - - -def se(x): - return f" \u00B1{(x*100):.2f}%" if isinstance(x, (int, float)) else "" - - -def gpu_cpu_from_pretty_env(pe: str): - if not isinstance(pe, str) or not pe: - return "Unknown GPU" - gpu_lines = [l for l in pe.splitlines() if l.startswith("GPU ")] - names = [re.sub(r"GPU \d+:\s*", "", l).strip() for l in gpu_lines] - c = Counter(names) - gpu_summary = " + ".join([f"{n}\u00D7 {name}" for name, n in c.items()]) if c else "Unknown GPU" - cpu_line = next((l.split(":", 1)[1].strip() for l in pe.splitlines() if l.startswith("Model name:")), None) - return gpu_summary + (f" ({cpu_line})" if cpu_line else "") - - -def detect_framework_kind(data: Dict[str, Any]) -> str: - """ - Classify JSON as: - - 'lm-eval' : lm-evaluation-harness style JSON - - 'lighteval' : lighteval JSON - - 'unknown' : anything else - """ - # lm-eval has lm_eval_version + results structure like results["gsm8k"]... [oai_citation:0‡results_2025-11-25T08-30-41.513104.json](sediment://file_000000001658720790705168e4c51783) - if "lm_eval_version" in data or "pretty_env_info" in data: - return "lm-eval" - # lighteval has config_general + config_tasks/results keyed by "|" [oai_citation:1‡results_2025-11-25T08-40-05.199875.json](sediment://file_000000006f3872078dd9c458c614c1f7) - if "config_general" in data and "results" in data: - return "lighteval" - return "unknown" - - -def find_all_jsons(results_dir: str) -> List[str]: - paths = [] - for root, _, _ in os.walk(results_dir): - for name in glob(os.path.join(root, "*.json")): - paths.append(name) - return paths - - -def find_latest_by_kind(results_dir: str) -> Tuple[Optional[str], Optional[str]]: - """ - Scan all JSONs under results_dir and return: - (latest_lm_eval_json_path, latest_lighteval_json_path) - """ - lm_eval_candidates = [] - lighteval_candidates = [] - - for path in find_all_jsons(results_dir): - try: - with open(path, "r") as f: - data = json.load(f) - except Exception: - continue - - kind = detect_framework_kind(data) - mtime = os.path.getmtime(path) - if kind == "lm-eval": - lm_eval_candidates.append((mtime, path)) - elif kind == "lighteval": - lighteval_candidates.append((mtime, path)) - - lm_path = max(lm_eval_candidates, default=(None, None))[1] - le_path = max(lighteval_candidates, default=(None, None))[1] - return lm_path, le_path - - -# ----------------------- -# lm-eval parsing -# ----------------------- - -def extract_lm_eval_metrics(data: Dict[str, Any], task: str) -> Dict[str, Any]: - res_all = data.get("results", {}) or {} - res = res_all.get(task) if isinstance(res_all, dict) else {} - if not res and isinstance(res_all, dict) and res_all: - any_key = next(iter(res_all.keys())) - res = res_all.get(any_key, {}) - task = any_key - - strict = res.get("exact_match,strict-match") - flex = res.get("exact_match,flexible-extract") - strict_se = res.get("exact_match_stderr,strict-match") - flex_se = res.get("exact_match_stderr,flexible-extract") - - n_eff = None - ns = 
data.get("n-samples") or data.get("n_samples") or {} - if isinstance(ns, dict): - tdict = ns.get(task) or ns.get("gsm8k") or {} - if isinstance(tdict, dict): - n_eff = tdict.get("effective") or tdict.get("n_eff") - - model = data.get("model_name") \ - or data.get("configs", {}).get(task, {}).get("metadata", {}).get("model") \ - or data.get("config", {}).get("model") \ - or "" - - fewshot = None - nshot = data.get("n-shot") or data.get("n_shot") or {} - if isinstance(nshot, dict): - fewshot = nshot.get(task) or nshot.get("gsm8k") - - limit = None - cfg = data.get("config") or {} - if isinstance(cfg, dict): - limit = cfg.get("limit") - - return { - "task": task, - "strict": strict, - "flex": flex, - "strict_se": strict_se, - "flex_se": flex_se, - "n_eff": n_eff, - "model": model, - "fewshot": fewshot, - "limit": limit, - } - - -def render_lm_eval_section(path: str, - args, - framework_label: str, - precision_label: str) -> Tuple[str, Dict[str, Any]]: - with open(path, "r") as f: - data = json.load(f) - - hardware = gpu_cpu_from_pretty_env(data.get("pretty_env_info", "")) - m = extract_lm_eval_metrics(data, args.task) - - print(f"### {args.task} Evaluation (lm-eval-harness)\n") - print("| Hardware | Framework | Precision | TP | EP | DP Attention | EM Strict | EM Flexible | N (eff) |") - print("|---|---|---:|--:|--:|:--:|--:|--:|--:|") - print( - f"| {hardware} | {framework_label} | {precision_label} | {args.tp} | {args.ep} | " - f"{str(args.dp_attention).lower()} | " - f"{pct(m['strict'])}{se(m['strict_se'])} | " - f"{pct(m['flex'])}{se(m['flex_se'])} | {m['n_eff'] or ''} |" - ) - - lim = m["limit"] - lim_str = str(int(lim)) if isinstance(lim, (int, float)) else (str(lim) if lim is not None else "") - fewshot = m["fewshot"] if m["fewshot"] is not None else "" - print( - f"\n_Model_: `{m['model']}`    " - f"_k-shot_: **{fewshot}**    " - f"_limit_: **{lim_str}** \n" - f"_Source_: `{os.path.basename(path)}`" - ) - return hardware, m - - -# ----------------------- -# lighteval parsing -# ----------------------- - -def extract_lighteval_metrics(data: Dict[str, Any], task_base: str) -> Dict[str, Any]: - res_all = data.get("results", {}) or {} - - # Prefer task-specific key like "gsm8k|5" over "all" [oai_citation:2‡results_2025-11-25T08-40-05.199875.json](sediment://file_000000006f3872078dd9c458c614c1f7) - task_key = None - for k in res_all.keys(): - if k.startswith(task_base): - task_key = k - break - if task_key is None and "all" in res_all: - task_key = "all" - - r = res_all.get(task_key, {}) - em = r.get("extractive_match") - em_se = r.get("extractive_match_stderr") - - # Fewshot & other metadata from config_tasks if available - fewshot = None - cfg_tasks = data.get("config_tasks", {}) - if isinstance(cfg_tasks, dict) and task_key in cfg_tasks: - fewshot = cfg_tasks[task_key].get("num_fewshots") - - # Model name from config_general - cg = data.get("config_general", {}) or {} - model = cg.get("model_name") or cg.get("model_config", {}).get("model_name", "") - - return { - "task": task_key or task_base, - "em": em, - "em_se": em_se, - "fewshot": fewshot, - "model": model, - # lighteval JSON you showed doesn’t expose an obvious effective N; leave blank - "n_eff": None, - } - - -def render_lighteval_section(path: str, - args, - framework_label: str, - precision_label: str, - hardware_fallback: Optional[str]) -> None: - with open(path, "r") as f: - data = json.load(f) - - m = extract_lighteval_metrics(data, args.task) - hardware = hardware_fallback or "Unknown GPU" - - print(f"### {args.task} 
Evaluation (lighteval)\n") - print("| Hardware | Framework | Precision | TP | EP | DP Attention | Extractive Match | N (eff) |") - print("|---|---|---:|--:|--:|:--:|--:|--:|") - print( - f"| {hardware} | {framework_label} | {precision_label} | {args.tp} | {args.ep} | " - f"{str(args.dp_attention).lower()} | " - f"{pct(m['em'])}{se(m['em_se'])} | {m['n_eff'] or ''} |" - ) - - fewshot = m["fewshot"] if m["fewshot"] is not None else "" - print( - f"\n_Model_: `{m['model']}`    " - f"_k-shot_: **{fewshot}** \n" - f"_Source_: `{os.path.basename(path)}`" - ) - - -# ----------------------- -# main -# ----------------------- - -def main(): - ap = argparse.ArgumentParser() - ap.add_argument("--results-dir", required=True) - ap.add_argument("--task", default="gsm8k") - ap.add_argument("--framework", default=os.environ.get("FRAMEWORK", "")) - ap.add_argument("--precision", default=os.environ.get("PRECISION", "")) - ap.add_argument("--tp", default=os.environ.get("TP", "1")) - ap.add_argument("--ep", default=os.environ.get("EP_SIZE", "1")) - ap.add_argument("--dp-attention", default=os.environ.get("DP_ATTENTION", "false")) - args = ap.parse_args() - - # Robust defaults if env vars / CLI args are empty - framework_label = args.framework or os.environ.get("FRAMEWORK") or "unknown" - precision_label = args.precision or os.environ.get("PRECISION") or "unknown" - - lm_path, le_path = find_latest_by_kind(args.results_dir) - - if not lm_path and not le_path: - print(f"### {args.task} Evaluation\n") - print(f"> No result JSON found in `{args.results_dir}`.") - return - - hardware_from_lm = None - - # 1) lm-eval section (if present) - if lm_path: - hardware_from_lm, _ = render_lm_eval_section( - lm_path, args, framework_label, precision_label - ) - - # Spacer between sections if both exist - if lm_path and le_path: - print("\n") - - # 2) lighteval section (if present) - if le_path: - render_lighteval_section( - le_path, args, framework_label, precision_label, hardware_from_lm - ) - - -if __name__ == "__main__": - try: - main() - except Exception as e: - # Never blow up the CI summary; emit a helpful line instead. - print(f"> Failed to render evaluation summary: {e}") - sys.exit(0) \ No newline at end of file diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index a9afc2bc9..a039a52f1 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -761,6 +761,11 @@ def main(): required=True, help='One or more configuration files (YAML format)' ) + parent_parser.add_argument( + '--run-evals', + action='store_true', + help='Opt-in flag to mark a subset of generated configs to run evals. When omitted, no evals run.' 
+ ) # Create main parser parser = argparse.ArgumentParser( @@ -1016,8 +1021,9 @@ def main(): else: parser.error(f"Unknown command: {args.command}") - # Choose eval - matrix_values = mark_eval_entries(matrix_values) + # Choose eval (opt-in via --run-evals) + if args.run_evals: + matrix_values = mark_eval_entries(matrix_values) # Validate output before printing validate_matrix_output(matrix_values) From 5ec3378de2a87629a385212f77a1eea8eaa3f799 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 4 Dec 2025 13:14:30 +0800 Subject: [PATCH 170/214] Resolve issues/nits --- benchmarks/benchmark_lib.sh | 7 +----- benchmarks/gptoss_fp4_h100_slurm.sh | 2 +- runners/launch_h100-cr.sh | 1 - runners/launch_mi300x-amd.sh | 1 - runners/launch_mi300x-cr.sh | 1 - runners/launch_mi325x-amd.sh | 1 - runners/launch_mi325x-tw.sh | 1 - runners/launch_mi355x-amd.sh | 1 - utils/collect_eval_results.py | 10 +++++++- utils/evals/READMEevals.md | 28 ++++++++++++++++++++++ utils/evals/math500.yaml | 36 +++++++++++++++++++++++++++++ 11 files changed, 75 insertions(+), 14 deletions(-) create mode 100644 utils/evals/READMEevals.md create mode 100644 utils/evals/math500.yaml diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 141f66c5a..2e28828a0 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -162,7 +162,7 @@ _install_lm_eval_deps() { python3 -m pip install -q --no-cache-dir "lm-eval[api]" || true # Temporary: workaround issue by using main python3 -m pip install -q --no-cache-dir --no-deps \ - "git+https://github.com/EleutherAI/lm-evaluation-harness.git@main" || true + "git+https://github.com/EleutherAI/lm-evaluation-harness.git@b315ef3b05176acc9732bb7fdec116abe1ecc476" || true } # Patch lm-eval filters to be robust to empty strings via sitecustomize @@ -450,11 +450,6 @@ def _patched___call_api(self, prompt, return_logits, max_new_tokens, num_samples if (not content) and reasoning: return response - if not content and LITELLM_CACHE: - logger.info("Empty content with caching on; retrying uncached once") - kwargs["caching"] = False - response = litellm.completion(**kwargs) - return response except litellm.BadRequestError as e: if "message" in e.__dict__ and "policy" in e.__dict__["message"]: diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh index b9f5fb958..a1321934e 100644 --- a/benchmarks/gptoss_fp4_h100_slurm.sh +++ b/benchmarks/gptoss_fp4_h100_slurm.sh @@ -53,7 +53,7 @@ run_benchmark_serving \ --input-len "$ISL" \ --output-len "$OSL" \ --random-range-ratio "$RANDOM_RANGE_RATIO" \ - --num-prompts $(( $CONC * 1 )) \ + --num-prompts $(( $CONC * 10 )) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh index 6b5d3f9d0..1070b6de0 100644 --- a/runners/launch_h100-cr.sh +++ b/runners/launch_h100-cr.sh @@ -12,7 +12,6 @@ docker run --rm --network=host --name=$server_name \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RUN_EVAL -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \ -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ - ${GH_SUM_ENV} ${GH_SUM_MOUNT} \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_h100_docker.sh" diff --git a/runners/launch_mi300x-amd.sh b/runners/launch_mi300x-amd.sh index 
f6fe97881..e5cea1ed6 100644 --- a/runners/launch_mi300x-amd.sh +++ b/runners/launch_mi300x-amd.sh @@ -15,7 +15,6 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ -e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL \ -${GH_SUM_ENV} ${GH_SUM_MOUNT} \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi300x_docker.sh" diff --git a/runners/launch_mi300x-cr.sh b/runners/launch_mi300x-cr.sh index 965ae4222..f864fec25 100644 --- a/runners/launch_mi300x-cr.sh +++ b/runners/launch_mi300x-cr.sh @@ -15,7 +15,6 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ -e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL \ - ${GH_SUM_ENV} ${GH_SUM_MOUNT} \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi300x_docker.sh" diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh index eb5f8e00c..1065167d7 100644 --- a/runners/launch_mi325x-amd.sh +++ b/runners/launch_mi325x-amd.sh @@ -11,7 +11,6 @@ salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=256 --time=180 --no JOB_ID=$(squeue -u $USER -h -o %A | head -n1) srun --jobid=$JOB_ID bash -c "sudo enroot import -o $SQUASH_FILE docker://$IMAGE" - srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ diff --git a/runners/launch_mi325x-tw.sh b/runners/launch_mi325x-tw.sh index ed6ff288e..488ce6ceb 100644 --- a/runners/launch_mi325x-tw.sh +++ b/runners/launch_mi325x-tw.sh @@ -11,7 +11,6 @@ salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=128 --time=180 --no JOB_ID=$(squeue -u $USER -h -o %A | head -n1) srun --jobid=$JOB_ID bash -c "sudo enroot import -o $SQUASH_FILE docker://$IMAGE" - srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ diff --git a/runners/launch_mi355x-amd.sh b/runners/launch_mi355x-amd.sh index a6c64b1ee..f6507388b 100644 --- a/runners/launch_mi355x-amd.sh +++ b/runners/launch_mi355x-amd.sh @@ -37,7 +37,6 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT -e NUM_PROMPTS \ -e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL \ -${GH_SUM_ENV} ${GH_SUM_MOUNT} \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi355x_docker.sh" diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py index 0254d640c..84be661ce 100644 --- a/utils/collect_eval_results.py +++ b/utils/collect_eval_results.py @@ -313,9 +313,17 @@ def main(): continue # Merge with meta + # Prefer explicit hardware identifiers from meta (if present) and fall back to parsed pretty_env_info + hw_meta = ( + meta.get('hw') + or meta.get('runner') + or meta.get('RUNNER_TYPE') + or None + ) + hw_value = hw_meta if hw_meta else m.get('hardware', 'Unknown GPU') row = { 'model': m.get('model') or meta.get('model') or 'unknown', - 'hw': m.get('hardware', 
'Unknown GPU'), + 'hw': hw_value, 'framework': (meta.get('framework') or 'unknown').lower(), 'precision': (meta.get('precision') or 'unknown').lower(), 'tp': int(meta.get('tp') or 1), diff --git a/utils/evals/READMEevals.md b/utils/evals/READMEevals.md new file mode 100644 index 000000000..511c80804 --- /dev/null +++ b/utils/evals/READMEevals.md @@ -0,0 +1,28 @@ +# Evals + +## What? +Quick graded QnA which measures model performance. Examples of test suites: +- **gsm8k**: Grade school math questions +- **gpqa**: Graduate level, Google-Proof multiple choice questions +- **math500**: Math questions spanning topics like probability, algebra, trigonometry, and geometry. + +## When? +At highest concurrency for highest TP and lowest TP, per GPU per model per ISL/OSL. Logic is defined in `mark_eval_entries` of `utils/matrix-logic/generate_sweep_configs.py`. + +## Why? +To verify how model outputs are affected by throughput optimizations. +- TP/Conc might affect model outputs +- Check kernel implementations for correctness + +## How? +- `run_eval`, defined in `benchmarks/benchmark_lib.sh`, is called in `benchmarks/*`. Either EleutherAI/lm-evaluation-harness (lmeval) or lighteval with litellm is run, using the same endpoint as the throughput benchmark. JSON results are processed and converted to a table with `utils/collect_eval_results.py`. + +## Misc +The following files are task definitions from lmeval; more info on the changes is in the files: +- `utils/evals/math500.yaml` +- `utils/evals/gsm8k.yaml` +The following files are task definitions from lighteval; more info on the changes is in the files: +- `utils/evals/custom_gsm8k.py` + + + diff --git a/utils/evals/math500.yaml b/utils/evals/math500.yaml new file mode 100644 index 000000000..09051d118 --- /dev/null +++ b/utils/evals/math500.yaml @@ -0,0 +1,36 @@ +# YAML from https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/hendrycks_math/hendrycks_math_algebra.yaml +# Changed regex and prompt +tag: + - math_word_problems +task: hendrycks_math_algebra +dataset_path: HuggingFaceH4/MATH-500 +process_docs: !function utils.process_docs +dataset_name: algebra +output_type: generate_until +training_split: train +test_split: test +doc_to_text: "You are solving competition math problems.\n\nFormat rules:\n- Answer on a new line. That line must start with `Answer: ` (capital A, colon, one space).\n- After `Answer: `, write ONLY the answer as inline LaTeX.\n- Use ONLY ASCII LaTeX commands (e.g. \\pi, \\frac{1}{2}, -). NO Unicode symbols.\n- Do NOT wrap the answer in $, $$, \\( \\), \\[ \\], or any other delimiters.\n- Do NOT use \\displaystyle or any display-style commands. Answer only this problem, the rest are examples.
Problem: {{problem}}\n" +process_results: !function utils.process_results +doc_to_target: "{{answer}}" +generation_kwargs: + until: + - "Problem:" + do_sample: false + temperature: 0 +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + regexes_to_ignore: + - "\\\\left" + - "\\\\right" + - "\\s+" +filter_list: + - name: "strict-match" + filter: + - function: "regex" + group_select: -1 + regex_pattern: "Answer:\\s*([^\\n]+)" + - function: "take_first" +metadata: + version: 1.0 \ No newline at end of file From ae4e481bc7e4ac6c9a269a690bddd580e017daf8 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 4 Dec 2025 19:48:49 +0800 Subject: [PATCH 171/214] fix summary table hardware --- benchmarks/benchmark_lib.sh | 3 ++- utils/collect_eval_results.py | 31 +++---------------------------- 2 files changed, 5 insertions(+), 29 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 2e28828a0..2e3ad10c4 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -324,7 +324,8 @@ append_lm_eval_summary() { "tp": ${TP:-1}, "ep": ${EP_SIZE:-1}, "dp_attention": ${dp_json}, - "model": "${model_name:-}" + "model": "${model_name:-}", + "hw": "${RUNNER_TYPE:-unknown}" } META diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py index 84be661ce..6bb771b5d 100644 --- a/utils/collect_eval_results.py +++ b/utils/collect_eval_results.py @@ -109,20 +109,6 @@ def scan_jsons(paths: List[Path]) -> Tuple[List[Tuple[float, Path]], List[Tuple[ return lm_path, le_path -def parse_pretty_env(pretty_env: str) -> str: - try: - lines = [l for l in pretty_env.splitlines() if l.startswith('GPU ')] - names = [l.split(':', 1)[1].strip() for l in lines] - if not names: - return 'Unknown GPU' - # Compress identical names (roughly) - from collections import Counter - c = Counter(names) - return ' + '.join([f"{n}× {name}" for name, n in c.items()]) - except Exception: - return 'Unknown GPU' - - def extract_lm_metrics(json_path: Path, task: Optional[str] = None) -> Dict[str, Any]: data = load_json(json_path) or {} results = data.get('results') or {} @@ -220,11 +206,6 @@ def get_pair(fname: Optional[str]) -> Tuple[Optional[float], Optional[float]]: if isinstance(td, dict): n_eff = td.get('effective') or td.get('n_eff') - hardware = 'Unknown GPU' - pe = data.get('pretty_env_info') - if isinstance(pe, str) and pe: - hardware = parse_pretty_env(pe) - model = ( data.get('model_name') or (data.get('configs', {}).get(t, {}) or {}).get('metadata', {}).get('model') @@ -239,7 +220,7 @@ def get_pair(fname: Optional[str]) -> Tuple[Optional[float], Optional[float]]: 'strict_se': strict_se, 'flex_se': flex_se, 'n_eff': n_eff, - 'hardware': hardware, + 'hardware': 'Unknown GPU', 'model': model, 'source': str(json_path) } @@ -313,14 +294,8 @@ def main(): continue # Merge with meta - # Prefer explicit hardware identifiers from meta (if present) and fall back to parsed pretty_env_info - hw_meta = ( - meta.get('hw') - or meta.get('runner') - or meta.get('RUNNER_TYPE') - or None - ) - hw_value = hw_meta if hw_meta else m.get('hardware', 'Unknown GPU') + # Only use explicit hardware label written to meta_env.json ('hw') + hw_value = meta.get('hw', 'Unknown GPU') row = { 'model': m.get('model') or meta.get('model') or 'unknown', 'hw': hw_value, From 48a220d527da89d353c70a2cce0cfffc9161ead2 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 4 Dec 2025 21:05:26 +0800 Subject: [PATCH 172/214] fix summary table hardware --- 
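Note on the diff below: it threads the runner label end to end (workflow input, then container env, then meta_env.json) so the collector no longer guesses hardware from pretty_env_info. A minimal sketch of that flow, not code from this series; the label "h100-cr" and the jq lookup are illustrative assumptions:

    # The workflow exports the runner label for the job (RUNNER_TYPE, below):
    export RUNNER_TYPE="h100-cr"                  # assumed example value of inputs.runner
    # The launch script forwards it into the serving container unchanged:
    docker run --rm -e RUNNER_TYPE "$IMAGE" benchmarks/gptoss_fp4_h100_docker.sh
    # append_lm_eval_summary (benchmark_lib.sh) writes it to meta_env.json as "hw";
    # the collector can then read the explicit label instead of parsing GPU strings:
    jq -r '.hw' meta_env.json                     # prints: h100-cr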
.github/workflows/benchmark-tmpl.yml | 1 + runners/launch_b200-dgxc.sh | 2 +- runners/launch_b200-nvd.sh | 2 +- runners/launch_h100-cr.sh | 2 +- runners/launch_mi300x-amd.sh | 2 +- runners/launch_mi300x-cr.sh | 2 +- runners/launch_mi355x-amd.sh | 2 +- utils/collect_eval_results.py | 5 +---- 8 files changed, 8 insertions(+), 10 deletions(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 8a8943628..d275f656c 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -134,6 +134,7 @@ jobs: - name: Launch job script env: RUNNER_NAME: ${{ runner.name }} + RUNNER_TYPE: ${{ inputs.runner }} RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_${{ env.FRAMEWORK }}_tp${{ env.TP }}_ep${{ env.EP_SIZE }}_dpa_${{ env.DP_ATTENTION }}_conc${{ env.CONC }}_${{ runner.name }} # Suppress per-job eval markdown from being appended to the step summary. # We'll publish a single combined eval table in the collection job instead. diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index 25d09313e..8406d4bd0 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -41,7 +41,7 @@ docker run --rm --init --network host --name $server_name \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE -e DP_ATTENTION \ -e NCCL_GRAPH_REGISTER=0 \ -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e NUM_PROMPTS -e RUN_EVAL \ +-e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e NUM_PROMPTS -e RUN_EVAL -e RUNNER_TYPE \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh" diff --git a/runners/launch_b200-nvd.sh b/runners/launch_b200-nvd.sh index c6ae289bb..fac3063f2 100644 --- a/runners/launch_b200-nvd.sh +++ b/runners/launch_b200-nvd.sh @@ -42,7 +42,7 @@ docker run --rm --init --network host --name $server_name \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE -e DP_ATTENTION \ -e NCCL_GRAPH_REGISTER=0 \ -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e NUM_PROMPTS -e RUN_EVAL \ +-e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e NUM_PROMPTS -e RUN_EVAL -e RUNNER_TYPE \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh" diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh index 1070b6de0..0174087e4 100644 --- a/runners/launch_h100-cr.sh +++ b/runners/launch_h100-cr.sh @@ -10,7 +10,7 @@ docker run --rm --network=host --name=$server_name \ --runtime=nvidia --gpus=all --ipc=host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RUN_EVAL -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \ +-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RUN_EVAL -e RUNNER_TYPE -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \ -e PYTHONPYCACHEPREFIX=/tmp/pycache/ 
-e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --entrypoint=/bin/bash \ $IMAGE \ diff --git a/runners/launch_mi300x-amd.sh b/runners/launch_mi300x-amd.sh index e5cea1ed6..55fffdb7c 100644 --- a/runners/launch_mi300x-amd.sh +++ b/runners/launch_mi300x-amd.sh @@ -14,7 +14,7 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ --e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL \ +-e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL -e RUNNER_TYPE \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi300x_docker.sh" diff --git a/runners/launch_mi300x-cr.sh b/runners/launch_mi300x-cr.sh index f864fec25..5bd6bd0e2 100644 --- a/runners/launch_mi300x-cr.sh +++ b/runners/launch_mi300x-cr.sh @@ -14,7 +14,7 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ --e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL \ +-e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL -e RUNNER_TYPE \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi300x_docker.sh" diff --git a/runners/launch_mi355x-amd.sh b/runners/launch_mi355x-amd.sh index f6507388b..e3f1ef8e9 100644 --- a/runners/launch_mi355x-amd.sh +++ b/runners/launch_mi355x-amd.sh @@ -36,7 +36,7 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT -e NUM_PROMPTS \ --e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL \ +-e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL -e RUNNER_TYPE \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi355x_docker.sh" diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py index 6bb771b5d..1aeb80e30 100644 --- a/utils/collect_eval_results.py +++ b/utils/collect_eval_results.py @@ -293,12 +293,9 @@ def main(): else: continue - # Merge with meta - # Only use explicit hardware label written to meta_env.json ('hw') - hw_value = meta.get('hw', 'Unknown GPU') row = { 'model': m.get('model') or meta.get('model') or 'unknown', - 'hw': hw_value, + 'hw': meta.get('hw'), 'framework': (meta.get('framework') or 'unknown').lower(), 'precision': (meta.get('precision') or 'unknown').lower(), 'tp': int(meta.get('tp') or 1), From 61327ca20f78e9d60f5271ef1f039df65989c9b1 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 4 Dec 2025 21:20:34 +0800 Subject: [PATCH 173/214] fix summary table hardware 2 --- utils/collect_eval_results.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py index 1aeb80e30..0c188473f 100644 --- a/utils/collect_eval_results.py +++ b/utils/collect_eval_results.py @@ -295,7 +295,7 @@ def 
main(): row = { 'model': m.get('model') or meta.get('model') or 'unknown', - 'hw': meta.get('hw'), + 'hw': (meta.get('hw') or 'unknown').upper(), 'framework': (meta.get('framework') or 'unknown').lower(), 'precision': (meta.get('precision') or 'unknown').lower(), 'tp': int(meta.get('tp') or 1), From 1cf2967d55b615bb75073d42fa80eed684230793 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 5 Dec 2025 23:06:47 +0800 Subject: [PATCH 174/214] final touches --- .github/workflows/eval-gms8k.yml | 64 ------------- .github/workflows/eval-tmpl.yml | 152 ------------------------------- benchmarks/benchmark_lib.sh | 1 + utils/collect_eval_results.py | 11 ++- 4 files changed, 9 insertions(+), 219 deletions(-) delete mode 100644 .github/workflows/eval-gms8k.yml delete mode 100644 .github/workflows/eval-tmpl.yml diff --git a/.github/workflows/eval-gms8k.yml b/.github/workflows/eval-gms8k.yml deleted file mode 100644 index 5a7e7e823..000000000 --- a/.github/workflows/eval-gms8k.yml +++ /dev/null @@ -1,64 +0,0 @@ -name: Eval - GSM8K (PoC) - -on: - workflow_dispatch: - inputs: - exp-name: - description: "Experiment name (prefix selects docker script)" - required: false - type: string - default: "gptoss_gsm8k_poc" - image: - description: "Serving image" - required: false - type: string - default: "vllm/vllm-openai:v0.11.0" - model: - description: "Model" - required: false - type: string - default: "openai/gpt-oss-120b" - tp: - description: "Tensor Parallel Size" - required: false - type: string - default: "2" - port: - description: "Server port" - required: false - type: string - default: "8888" - num_fewshot: - description: "Fewshot k for GSM8K" - required: false - type: string - default: "5" - limit: - description: "Sample limit for GSM8K" - required: false - type: string - default: "1300" - push: - paths: - - '.github/workflows/eval-gms8k.yml' - - '.github/workflows/eval-tmpl.yml' - - 'benchmarks/benchmark_lib.sh' - -jobs: - eval: - uses: ./.github/workflows/eval-tmpl.yml - secrets: inherit - with: - runner: h100-cw_0 - image: ${{ inputs.image || 'vllm/vllm-openai:v0.11.0' }} - model: ${{ inputs.model || 'openai/gpt-oss-120b' }} - framework: vllm - precision: fp4 - exp-name: ${{ inputs.exp-name || 'gptoss_gsm8k_poc' }} - tp: '4' - ep: '1' - dp-attn: false - port: ${{ inputs.port || '8888' }} - eval-task: gsm8k - num-fewshot: ${{ inputs.num_fewshot || '5' }} - limit: ${{ inputs.limit || '200' }} diff --git a/.github/workflows/eval-tmpl.yml b/.github/workflows/eval-tmpl.yml deleted file mode 100644 index e4e65a581..000000000 --- a/.github/workflows/eval-tmpl.yml +++ /dev/null @@ -1,152 +0,0 @@ -name: Template - Eval - -on: - workflow_call: - inputs: - runner: - required: true - type: string - image: - required: true - type: string - model: - required: true - type: string - framework: - required: true - type: string - precision: - required: true - type: string - exp-name: - required: true - type: string - tp: - required: true - type: string - ep: - required: false - type: string - default: '1' - dp-attn: - required: false - type: boolean - default: false - port: - required: false - type: string - default: '8888' - eval-task: - required: true - type: string - num-fewshot: - required: false - type: string - default: '5' - limit: - required: false - type: string - default: '200' - -env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - HF_HUB_CACHE: '/mnt/hf_hub_cache/' - EXP_NAME: ${{ inputs.exp-name }} - MODEL: ${{ inputs.model }} - IMAGE: ${{ inputs.image }} - FRAMEWORK: ${{ inputs.framework }} - PRECISION: ${{ 
inputs.precision }} - TP: ${{ inputs.tp }} - EP_SIZE: ${{ inputs.ep }} - DP_ATTENTION: ${{ inputs.dp-attn }} - PORT: ${{ inputs.port }} - EVAL_TASK: ${{ inputs['eval-task'] }} - NUM_FEWSHOT: ${{ inputs['num-fewshot'] }} - LIMIT: ${{ inputs.limit }} - # Keep eval outputs only under /tmp - EVAL_RESULT_DIR: /tmp/eval_out - CONC: '32' - MAX_MODEL_LEN: '4096' - ISL: 1024 - OSL: 1024 - RANDOM_RANGE_RATIO: '0.8' - RESULT_FILENAME: results - RUN_EVAL: true - -jobs: - eval: - runs-on: ${{ inputs.runner }} - timeout-minutes: 180 - name: "Eval ${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.precision }} tp=${{ inputs.tp }} task=${{ inputs['eval-task'] }} limit=${{ inputs.limit }}" - steps: - - name: Resource cleanup - run: | - sudo rm -rf /home/nvadmin/actions-runner/_work/InferenceMAX/InferenceMAX/eval_out/ - # Helper to avoid indefinite hangs on flaky tools (Docker/Slurm) - safe_timeout() { - if command -v timeout >/dev/null 2>&1; then - timeout -k 5 30s "$@" - else - "$@" - fi - } - host=$(hostname) - if [[ "$host" == "b200-81" || "$host" == "b200-80" || "$host" == "b200-79" ]]; then - if command -v docker >/dev/null 2>&1; then - echo "[INFO] Running container-by-container cleanup on $host" - cids=$(safe_timeout docker ps -aq || true) - for cid in $cids; do - echo "[INFO] Cleaning container $cid" - safe_timeout docker stop -t 90 "$cid" || true - safe_timeout docker wait "$cid" >/dev/null 2>&1 || true - safe_timeout docker rm -f "$cid" >/dev/null 2>&1 || true - done - sleep 2 - if nvidia-smi --query-compute-apps=pid --format=csv,noheader | grep -q '[0-9]'; then - echo "[WARN] After stop, GPU still busy:" - nvidia-smi || true - fi - else - echo "[Docker] docker client not found; skipping cleanup" - fi - else - echo "[Docker] skipping docker cleanup on host $host" - fi - # Best-effort cleanup of prior eval outputs; do not block - - if command -v squeue >/dev/null 2>&1; then - echo "[Slurm] Cleaning up resources ..." - safe_timeout scancel -u "$USER" || true - # Wait up to 5 minutes for jobs to clear to avoid indefinite hang - end=$((SECONDS + 300)) - while [ $SECONDS -lt $end ]; do - queued=$(safe_timeout squeue -u "$USER" --noheader --format='%i' 2>/dev/null || true) - if [ -z "$queued" ]; then - break - fi - echo "$queued" | sed 's/^/[Slurm] pending job: /' || true - sleep 5 - done - # Final status; do not block - safe_timeout squeue -u "$USER" || true - if [ -n "$(safe_timeout squeue -u "$USER" --noheader --format='%i' 2>/dev/null || true)" ]; then - echo "[Slurm] Jobs still present after timeout; proceeding" - fi - fi - - - uses: actions/checkout@v5 - with: - fetch-depth: 0 - # Avoid aggressive workspace deletion if stale, rely on git reset/clean later - clean: true - - - name: Launch eval via runner script - env: - RUNNER_NAME: ${{ runner.name }} - RUN_MODE: eval - # Optional: structured filename if runner chooses to use it later - EVAL_RESULT_BASENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_${{ env.FRAMEWORK }}_${{ runner.name }} - run: | - bash ./runners/launch_${RUNNER_NAME%%_*}.sh - - # Intentionally no eval artifact uploads: eval outputs remain in /tmp only. 
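With the standalone eval workflow and its template deleted above, evals now run inline after the throughput pass, inside the same job that already has the server up (see the RUN_EVAL guard added to the benchmark scripts earlier in this series). A minimal sketch of that pattern, assuming run_eval and append_lm_eval_summary are sourced from benchmarks/benchmark_lib.sh and the server is already listening on $PORT:

    #!/bin/bash
    source benchmarks/benchmark_lib.sh
    # ... start the server and run run_benchmark_serving against $PORT ...
    if [ "${RUN_EVAL}" = "true" ]; then
      # Evals hit the same endpoint as the throughput benchmark, at doubled concurrency
      run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
      append_lm_eval_summary   # writes meta_env.json plus result JSONs for the collector
    fi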
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 2e3ad10c4..99b56e20e 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -322,6 +322,7 @@ append_lm_eval_summary() { "framework": "${fw:-unknown}", "precision": "${prec:-unknown}", "tp": ${TP:-1}, + "conc": ${CONC:-1}, "ep": ${EP_SIZE:-1}, "dp_attention": ${dp_json}, "model": "${model_name:-}", diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py index 0c188473f..4f6f0dd30 100644 --- a/utils/collect_eval_results.py +++ b/utils/collect_eval_results.py @@ -300,6 +300,7 @@ def main(): 'precision': (meta.get('precision') or 'unknown').lower(), 'tp': int(meta.get('tp') or 1), 'ep': int(meta.get('ep') or 1), + 'conc': int(meta.get('conc') or 0), 'dp_attention': str(meta.get('dp_attention') or 'false'), 'task': m.get('task') or 'unknown', 'em_strict': m.get('strict'), @@ -312,14 +313,17 @@ def main(): rows.append(row) # Sort for stable output - rows.sort(key=lambda r: (r.get('model',''), r.get('hw',''), r.get('framework',''), r.get('precision',''), r.get('tp',0), r.get('ep',0))) + rows.sort(key=lambda r: ( + r.get('hw',''), r.get('framework',''), + r.get('precision',''), r.get('tp',0), r.get('conc',0) + )) if not rows: print('> No eval results found to summarize.') else: # Print Markdown summary table - print('| Model | Hardware | Framework | Precision | TP | EP | DPA | Task | EM Strict | EM Flexible | N (eff) |') - print('| :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |') + print('| Model | Hardware | Framework | Precision | TP | EP | Conc | DPA | Task | EM Strict | EM Flexible | N (eff) |') + print('| :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |') for r in rows: print( f"| {r['model']} " @@ -328,6 +332,7 @@ def main(): f"| {r['precision'].upper()} " f"| {r['tp']} " f"| {r['ep']} " + f"| {r['conc']} " f"| {r['dp_attention']} " f"| {r['task']} " f"| {pct(r['em_strict'])}{se(r['em_strict_se'])} " From 1d889b8d75d352b723b38049a3a095dca4385bf7 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 6 Dec 2025 11:25:50 +0800 Subject: [PATCH 175/214] Cleanup comments, amend lighteval --- benchmarks/benchmark_lib.sh | 5 ++--- benchmarks/gptoss_fp4_h200_slurm.sh | 2 -- utils/evals/custom_gsm8k.py | 2 +- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 807d6b8eb..18438e8b9 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -160,7 +160,6 @@ run_benchmark_serving() { _install_lm_eval_deps() { python3 -m pip install -q --no-cache-dir "lm-eval[api]" || true - # Temporary: workaround issue by using main python3 -m pip install -q --no-cache-dir --no-deps \ "git+https://github.com/EleutherAI/lm-evaluation-harness.git@b315ef3b05176acc9732bb7fdec116abe1ecc476" || true } @@ -356,7 +355,7 @@ META # ------------------------------ _install_lighteval_deps() { - python3 -m pip install -q --no-cache-dir "lighteval[api]" "litellm" || true + python3 -m pip install -q --no-cache-dir "lighteval==0.13.0" "litellm==1.80.7" || true } # Patch lighteval's LiteLLMClient to handle reasoning content and Python name mangling @@ -615,7 +614,7 @@ run_lighteval_eval() { local base_url="http://0.0.0.0:${port}/v1" export OPENAI_API_KEY="${OPENAI_API_KEY:-EMPTY}" -
local MODEL_ARGS="model_name=${lite_model},base_url=${base_url},api_key=${OPENAI_API_KEY},generation_parameters={temperature:0.0,top_p:1,max_new_tokens:2048},concurrent_requests=${concurrent_requests}" local TASK_SPEC="${task}|${num_fewshot}" # Respect absolute paths (e.g., /tmp/eval_out); otherwise write under /workspace diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh index ac19e6da8..b379e91e5 100644 --- a/benchmarks/gptoss_fp4_h200_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_slurm.sh @@ -39,8 +39,6 @@ export TORCH_CUDA_ARCH_LIST="9.0" PORT=$(( 8888 + $PORT_OFFSET )) MODEL_NAME=${MODEL##*/} -export TORCH_CUDA_ARCH_LIST="9.0" - PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT \ --config config.yaml \ --gpu-memory-utilization 0.9 \ diff --git a/utils/evals/custom_gsm8k.py b/utils/evals/custom_gsm8k.py index 5445f5732..ac6c0b9be 100644 --- a/utils/evals/custom_gsm8k.py +++ b/utils/evals/custom_gsm8k.py @@ -13,7 +13,7 @@ evaluation_splits=["test"], few_shots_split=None, few_shots_select="random_sampling_from_train", - generation_size=768, # raised this from 256 + generation_size=1024, # raised this from 256 metrics=[Metrics.expr_gold_metric], stop_sequence=None, # avoid early stop on "Question:" version=0, From 779a25793ca900e98dab3480be00874fb7b5ac8e Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 15 Dec 2025 11:48:18 -0600 Subject: [PATCH 176/214] pt 1 manual merge conflict fixes --- utils/evals/{READMEevals.md => EVALS.md} | 0 utils/matrix-logic/generate_sweep_configs.py | 1036 ----------- utils/matrix-logic/get_test_sweep_configs.py | 151 -- .../test_generate_sweep_configs.py | 1656 ----------------- utils/matrix_logic/generate_sweep_configs.py | 748 ++++++++ .../{matrix-logic => matrix_logic}/pytest.ini | 0 .../test_generate_sweep_configs.py | 948 ++++++++++ utils/matrix_logic/test_validation.py | 869 +++++++++ utils/matrix_logic/validation.py | 438 +++++ 9 files changed, 3003 insertions(+), 2843 deletions(-) rename utils/evals/{READMEevals.md => EVALS.md} (100%) delete mode 100644 utils/matrix-logic/generate_sweep_configs.py delete mode 100644 utils/matrix-logic/get_test_sweep_configs.py delete mode 100644 utils/matrix-logic/test_generate_sweep_configs.py create mode 100644 utils/matrix_logic/generate_sweep_configs.py rename utils/{matrix-logic => matrix_logic}/pytest.ini (100%) create mode 100644 utils/matrix_logic/test_generate_sweep_configs.py create mode 100644 utils/matrix_logic/test_validation.py create mode 100644 utils/matrix_logic/validation.py diff --git a/utils/evals/READMEevals.md b/utils/evals/EVALS.md similarity index 100% rename from utils/evals/READMEevals.md rename to utils/evals/EVALS.md diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py deleted file mode 100644 index a039a52f1..000000000 --- a/utils/matrix-logic/generate_sweep_configs.py +++ /dev/null @@ -1,1036 +0,0 @@ -import json -import yaml -import argparse -from pydantic import BaseModel, Field, ValidationError, ConfigDict -from typing import List - -# Field name constants -# Top-level config fields -FIELD_IMAGE = 'image' -FIELD_MODEL = 'model' -FIELD_MODEL_PREFIX = 'model-prefix' -FIELD_PRECISION = 'precision' -FIELD_FRAMEWORK = 'framework' -FIELD_RUNNER = 'runner' -FIELD_SEQ_LEN_CONFIGS = 'seq-len-configs' - -# Seq-len-config fields -FIELD_ISL = 'isl' -FIELD_OSL = 'osl' -FIELD_SEARCH_SPACE = 'search-space' - -# Search-space/benchmark fields -FIELD_TP = 'tp' -FIELD_CONC_START = 'conc-start' 
-FIELD_CONC_END = 'conc-end' -FIELD_EP = 'ep' -FIELD_DP_ATTN = 'dp-attn' - -# Matrix entry fields -FIELD_CONC = 'conc' -FIELD_MAX_MODEL_LEN = 'max-model-len' -FIELD_EXP_NAME = 'exp-name' - -# Eval -FIELD_RUN_EVAL = 'run-eval' - -seq_len_stoi = { - "1k1k": (1024, 1024), - "1k8k": (1024, 8192), - "8k1k": (8192, 1024) -} - -# Reverse mapping for exp-name generation -seq_len_itos = {v: k for k, v in seq_len_stoi.items()} - - -def seq_len_to_str(isl: int, osl: int) -> str: - """Convert sequence lengths to short string representation. - - Returns the short name (e.g., '1k1k') if it exists in the mapping, - otherwise returns 'isl_osl' format. - """ - return seq_len_itos.get((isl, osl), f"{isl}_{osl}") - - -class MatrixEntry(BaseModel): - """Pydantic model for validating matrix entry structure.""" - model_config = ConfigDict(extra='forbid', populate_by_name=True) - - image: str - model: str - precision: str - framework: str - runner: str - isl: int - osl: int - tp: int - ep: int - dp_attn: bool = Field(alias='dp-attn') - conc: int - max_model_len: int = Field(alias='max-model-len') - exp_name: str = Field(alias='exp-name') - run_eval: bool = Field(alias='run-eval', default=False) - - -def validate_matrix_output(matrix_values: List[dict]) -> List[dict]: - """Validate that matrix_values entries match the expected structure. - - Raises ValueError if any entry fails validation. - Returns the original list if all entries are valid. - """ - for i, entry in enumerate(matrix_values): - try: - MatrixEntry(**entry) - except ValidationError as e: - raise ValueError(f"Matrix entry at index {i} failed validation:\n{e}") - return matrix_values - -def mark_eval_entries(matrix_values: List[dict]) -> List[dict]: - """Mark entries that should run evaluation. - - For each unique (model, runner, framework, precision, isl, osl) combination: - - Mark highest TP with highest conc - - Mark lowest TP with highest conc - """ - from collections import defaultdict - - # Group entries by (model, runner, framework, precision, isl, osl) - # This ensures we compare within the same configuration, not across different frameworks - groups = defaultdict(list) - for i, entry in enumerate(matrix_values): - key = ( - entry[FIELD_MODEL], - entry[FIELD_RUNNER], - entry[FIELD_FRAMEWORK], - entry[FIELD_PRECISION], - entry[FIELD_ISL], - entry[FIELD_OSL] - ) - groups[key].append((i, entry)) - - # For each group, find highest TP/highest conc and lowest TP/highest conc - eval_indices = set() - for key, entries in groups.items(): - if not entries: - continue - - # Find min and max TP values - min_tp = min(e[FIELD_TP] for _, e in entries) - max_tp = max(e[FIELD_TP] for _, e in entries) - - # Find highest conc for highest TP - highest_tp_entries = [(i, e) for i, e in entries if e[FIELD_TP] == max_tp] - if highest_tp_entries: - max_conc_highest_tp = max(e[FIELD_CONC] for _, e in highest_tp_entries) - for i, e in highest_tp_entries: - if e[FIELD_CONC] == max_conc_highest_tp: - eval_indices.add(i) - - # Find highest conc for lowest TP (only if different from max_tp) - if min_tp != max_tp: - lowest_tp_entries = [(i, e) for i, e in entries if e[FIELD_TP] == min_tp] - if lowest_tp_entries: - max_conc_lowest_tp = max(e[FIELD_CONC] for _, e in lowest_tp_entries) - for i, e in lowest_tp_entries: - if e[FIELD_CONC] == max_conc_lowest_tp: - eval_indices.add(i) - - # Mark the selected entries - for i, entry in enumerate(matrix_values): - entry[FIELD_RUN_EVAL] = i in eval_indices - - return matrix_values - -def 
validate_master_configs_structure(all_config_data): - """Validate the structure of all master config entries. - - This validates that all required fields are present, have correct types, - and no extra fields exist. Should be called once after loading config files. - """ - for key, val in all_config_data.items(): - # Check for required top-level fields and their types - required_fields = { - FIELD_IMAGE: str, - FIELD_MODEL: str, - FIELD_MODEL_PREFIX: str, - FIELD_PRECISION: str, - FIELD_FRAMEWORK: str, - FIELD_RUNNER: str, - FIELD_SEQ_LEN_CONFIGS: list - } - - for field, expected_type in required_fields.items(): - if field not in val or val[field] is None: - raise ValueError( - f"Missing required field '{field}' for key '{key}'") - if not isinstance(val[field], expected_type): - raise ValueError( - f"Field '{field}' must be {expected_type.__name__} for key '{key}', got {type(val[field]).__name__}") - - seq_len_configs = val[FIELD_SEQ_LEN_CONFIGS] - if len(seq_len_configs) == 0: - raise ValueError( - f"'{FIELD_SEQ_LEN_CONFIGS}' must be a non-empty list for key '{key}'") - - # Validate each seq-len-config - for i, seq_config in enumerate(seq_len_configs): - # Check isl - if FIELD_ISL not in seq_config or seq_config[FIELD_ISL] is None: - raise ValueError( - f"Missing '{FIELD_ISL}' in seq-len-config[{i}] for key '{key}'") - if not isinstance(seq_config[FIELD_ISL], int): - raise ValueError( - f"'{FIELD_ISL}' must be int in seq-len-config[{i}] for key '{key}'") - - # Check osl - if FIELD_OSL not in seq_config or seq_config[FIELD_OSL] is None: - raise ValueError( - f"Missing '{FIELD_OSL}' in seq-len-config[{i}] for key '{key}'") - if not isinstance(seq_config[FIELD_OSL], int): - raise ValueError( - f"'{FIELD_OSL}' must be int in seq-len-config[{i}] for key '{key}'") - - bmk_space = seq_config.get(FIELD_SEARCH_SPACE) - if not bmk_space or not isinstance(bmk_space, list) or len(bmk_space) == 0: - raise ValueError( - f"Missing or invalid '{FIELD_SEARCH_SPACE}' in seq-len-config[{i}] for key '{key}'") - - # Validate each benchmark in search-space - for j, bmk in enumerate(bmk_space): - # Define allowed fields - allowed_fields = {FIELD_TP, FIELD_CONC_START, - FIELD_CONC_END, FIELD_EP, FIELD_DP_ATTN} - required_bmk_fields = {FIELD_TP: int, - FIELD_CONC_START: int, FIELD_CONC_END: int} - optional_bmk_fields = {FIELD_EP: int, FIELD_DP_ATTN: bool} - - # Check for extra fields - extra_fields = set(bmk.keys()) - allowed_fields - if extra_fields: - raise ValueError( - f"Extra fields {extra_fields} in search-space[{j}] of seq-len-config[{i}] for key '{key}'") - - # Validate required fields - for field, expected_type in required_bmk_fields.items(): - if field not in bmk or bmk[field] is None: - raise ValueError( - f"Missing '{field}' in search-space[{j}] of seq-len-config[{i}] for key '{key}'") - if not isinstance(bmk[field], expected_type): - raise ValueError( - f"'{field}' must be {expected_type.__name__} in search-space[{j}] of seq-len-config[{i}] for key '{key}'") - - # Validate optional fields if they exist - for field, expected_type in optional_bmk_fields.items(): - if field in bmk and bmk[field] is not None: - if not isinstance(bmk[field], expected_type): - raise ValueError( - f"'{field}' must be {expected_type.__name__} in search-space[{j}] of seq-len-config[{i}] for key '{key}'") - - -def generate_full_sweep(args, all_config_data): - """Generate full sweep configurations with optional filtering. - - Supports filtering by model prefix, precision, framework, runner type, and sequence lengths. 
-    Supports test mode to only run highest TP with lowest concurrency.
-
-    All filters are optional - can generate sweeps for all configs or filter by specific criteria.
-
-    Assumes all_config_data has been validated by validate_master_configs_structure().
-    """
-    # Validate runner types if specified
-    if args.runner_type:
-        if not args.runner_config:
-            raise ValueError(
-                "--runner-config is required when --runner-type is specified")
-
-        try:
-            with open(args.runner_config, 'r') as f:
-                runner_config = yaml.safe_load(f)
-        except FileNotFoundError:
-            raise ValueError(
-                f"Runner config file '{args.runner_config}' does not exist.")
-
-        valid_runner_types = set(runner_config.keys())
-        invalid_runners = set(args.runner_type) - valid_runner_types
-        if invalid_runners:
-            raise ValueError(
-                f"Invalid runner type(s): {invalid_runners}. "
-                f"Valid runner types are: {', '.join(sorted(valid_runner_types))}")
-
-    matrix_values = []
-
-    # Convert seq-lens to set of (isl, osl) tuples for filtering
-    seq_lens_filter = None
-    if args.seq_lens:
-        seq_lens_filter = {seq_len_stoi[sl] for sl in args.seq_lens}
-
-    for key, val in all_config_data.items():
-        # Filter by model prefix if specified
-        if args.model_prefix:
-            if not any(key.startswith(prefix) for prefix in args.model_prefix):
-                continue
-
-        # Filter by precision if specified
-        if args.precision and val[FIELD_PRECISION] not in args.precision:
-            continue
-
-        # Filter by framework if specified
-        if args.framework and val[FIELD_FRAMEWORK] not in args.framework:
-            continue
-
-        # Filter by runner type if specified
-        if args.runner_type and val[FIELD_RUNNER] not in args.runner_type:
-            continue
-
-        seq_len_configs = val[FIELD_SEQ_LEN_CONFIGS]
-        image = val[FIELD_IMAGE]
-        model = val[FIELD_MODEL]
-        precision = val[FIELD_PRECISION]
-        framework = val[FIELD_FRAMEWORK]
-        runner = val[FIELD_RUNNER]
-        model_code = val[FIELD_MODEL_PREFIX]
-
-        for seq_config in seq_len_configs:
-            isl = seq_config[FIELD_ISL]
-            osl = seq_config[FIELD_OSL]
-
-            # Filter by sequence lengths if specified
-            if seq_lens_filter and (isl, osl) not in seq_lens_filter:
-                continue
-
-            bmk_space = seq_config[FIELD_SEARCH_SPACE]
-
-            if args.test_mode:
-                # In test mode, use highest TP with lowest concurrency
-                highest_tp_bmk = max(bmk_space, key=lambda x: x[FIELD_TP])
-                tp = highest_tp_bmk[FIELD_TP]
-                conc = highest_tp_bmk[FIELD_CONC_START]
-                ep = highest_tp_bmk.get(FIELD_EP)
-                dp_attn = highest_tp_bmk.get(FIELD_DP_ATTN)
-
-                seq_len_str = seq_len_to_str(isl, osl)
-                entry = {
-                    FIELD_IMAGE: image,
-                    FIELD_MODEL: model,
-                    FIELD_PRECISION: precision,
-                    FIELD_FRAMEWORK: framework,
-                    FIELD_RUNNER: runner,
-                    FIELD_ISL: isl,
-                    FIELD_OSL: osl,
-                    FIELD_TP: tp,
-                    FIELD_EP: 1, # Default
-                    FIELD_DP_ATTN: False, # Default
-                    FIELD_CONC: conc,
-                    FIELD_MAX_MODEL_LEN: isl + osl + 200,
-                    FIELD_EXP_NAME: f"{model_code}_{seq_len_str}",
-                }
-
-                if ep is not None:
-                    entry[FIELD_EP] = ep
-                if dp_attn is not None:
-                    entry[FIELD_DP_ATTN] = dp_attn
-
-                matrix_values.append(entry)
-            else:
-                # Full sweep mode
-                for bmk in bmk_space:
-                    tp = bmk[FIELD_TP]
-                    conc_start = bmk[FIELD_CONC_START]
-                    conc_end = bmk[FIELD_CONC_END]
-                    ep = bmk.get(FIELD_EP)
-                    dp_attn = bmk.get(FIELD_DP_ATTN)
-
-                    conc = conc_start
-                    while conc <= conc_end:
-                        seq_len_str = seq_len_to_str(isl, osl)
-                        entry = {
-                            FIELD_IMAGE: image,
-                            FIELD_MODEL: model,
-                            FIELD_PRECISION: precision,
-                            FIELD_FRAMEWORK: framework,
-                            FIELD_RUNNER: runner,
-                            FIELD_ISL: isl,
-                            FIELD_OSL: osl,
-                            FIELD_TP: tp,
-                            FIELD_CONC: conc,
-                            FIELD_MAX_MODEL_LEN: isl + osl + 200,
-                            FIELD_EP: 1, # Default
-                            FIELD_DP_ATTN: False, # Default
-                            FIELD_EXP_NAME: f"{model_code}_{seq_len_str}",
-                        }
-
-                        if ep is not None:
-                            entry[FIELD_EP] = ep
-                        if dp_attn is not None:
-                            entry[FIELD_DP_ATTN] = dp_attn
-
-                        matrix_values.append(entry)
-
-                        if conc == conc_end:
-                            break
-                        conc *= args.step_size
-                        if conc > conc_end:
-                            conc = conc_end
-
-    if len(matrix_values) == 0:
-        error_msg = "No configs found matching filters:"
-        if args.model_prefix:
-            error_msg += f" model-prefix={args.model_prefix}"
-        if args.precision:
-            error_msg += f" precision={args.precision}"
-        if args.framework:
-            error_msg += f" framework={args.framework}"
-        if args.runner_type:
-            error_msg += f" runner-type={args.runner_type}"
-        if seq_lens_filter:
-            error_msg += f" seq-lens={args.seq_lens}"
-        raise ValueError(error_msg)
-
-    return matrix_values
-
-
-def generate_test_config(args, all_config_data):
-    """Generate test configurations for a specific key.
-
-    Assumes all_config_data has been validated by validate_master_configs_structure().
-    """
-    try:
-        with open(args.runner_config, 'r') as f:
-            runner_config = yaml.safe_load(f)
-    except FileNotFoundError as e:
-        raise ValueError(
-            f"Runner config file '{args.runner_config}' does not exist.")
-
-    val = all_config_data.get(args.key)
-
-    if not val:
-        raise ValueError(
-            f"Specified key '{args.key}' does not exist in config files.")
-
-    # Extract model code from config
-    model_code = val[FIELD_MODEL_PREFIX]
-
-    runner_nodes = runner_config.get(val[FIELD_RUNNER], [])
-    if args.runner_node and args.runner_node not in runner_nodes:
-        raise ValueError(
-            f"Runner node '{args.runner_node}' is not compatible with config '{args.key}' which runs on runner type '{val[FIELD_RUNNER]}'. Available runner nodes for this config are '{', '.join(runner_nodes)}'.")
-
-    seq_len_configs = val[FIELD_SEQ_LEN_CONFIGS]
-    image = val[FIELD_IMAGE]
-    model = val[FIELD_MODEL]
-    precision = val[FIELD_PRECISION]
-    framework = val[FIELD_FRAMEWORK]
-    # Use default runner or specific runner node if input by user
-    runner = val[FIELD_RUNNER] if not args.runner_node else args.runner_node
-
-    # Convert seq-lens to set of (isl, osl) tuples for filtering
-    seq_lens_filter = None
-    if args.seq_lens:
-        seq_lens_filter = {seq_len_stoi[sl] for sl in args.seq_lens}
-
-    matrix_values = []
-
-    # Process each sequence length configuration
-    for seq_config in seq_len_configs:
-        isl = seq_config[FIELD_ISL]
-        osl = seq_config[FIELD_OSL]
-
-        # Filter by sequence lengths if specified
-        if seq_lens_filter and (isl, osl) not in seq_lens_filter:
-            continue
-
-        bmk_space = seq_config[FIELD_SEARCH_SPACE]
-
-        for bmk in bmk_space:
-            tp = bmk[FIELD_TP]
-            conc_start = bmk[FIELD_CONC_START]
-            conc_end = bmk[FIELD_CONC_END]
-            ep = bmk.get(FIELD_EP)
-            dp_attn = bmk.get(FIELD_DP_ATTN)
-
-            # In test mode, only use the lowest concurrency (conc_start)
-            if args.test_mode:
-                entry = {
-                    FIELD_IMAGE: image,
-                    FIELD_MODEL: model,
-                    FIELD_PRECISION: precision,
-                    FIELD_FRAMEWORK: framework,
-                    FIELD_RUNNER: runner,
-                    FIELD_ISL: isl,
-                    FIELD_OSL: osl,
-                    FIELD_TP: tp,
-                    FIELD_EP: 1, # Default
-                    FIELD_DP_ATTN: False, # Default
-                    FIELD_CONC: conc_start,
-                    FIELD_MAX_MODEL_LEN: isl + osl,
-                    FIELD_EXP_NAME: f"{model_code}_test",
-                }
-
-                # Add optional fields if they exist
-                if ep is not None:
-                    entry[FIELD_EP] = ep
-                if dp_attn is not None:
-                    entry[FIELD_DP_ATTN] = dp_attn
-
-                matrix_values.append(entry)
-            else:
-                # Generate entries for each concurrency value in the range
-                conc = conc_start
-                while conc <= conc_end:
-                    seq_len_str = seq_len_to_str(isl, osl)
-                    entry = {
-                        FIELD_IMAGE: image,
-                        FIELD_MODEL: model,
-                        FIELD_PRECISION: precision,
-                        FIELD_FRAMEWORK: framework,
-                        FIELD_RUNNER: runner,
-                        FIELD_ISL: isl,
-                        FIELD_OSL: osl,
-                        FIELD_TP: tp,
-                        FIELD_EP: 1, # Default
-                        FIELD_DP_ATTN: False, # Default
-                        FIELD_CONC: conc,
-                        FIELD_MAX_MODEL_LEN: isl + osl,
-                        FIELD_EXP_NAME: f"{model_code}_{seq_len_str}",
-                    }
-
-                    # Add optional fields if they exist
-                    if ep is not None:
-                        entry[FIELD_EP] = ep
-                    if dp_attn is not None:
-                        entry[FIELD_DP_ATTN] = dp_attn
-
-                    matrix_values.append(entry)
-
-                    if conc == conc_end:
-                        break
-                    conc *= args.step_size
-                    if conc > conc_end:
-                        conc = conc_end
-
-    return matrix_values
-
-
-def generate_runner_model_sweep_config(args, all_config_data):
-    """Generate runner-model sweep configurations.
-
-    Assumes all_config_data has been validated by validate_master_configs_structure().
-    """
-    try:
-        with open(args.runner_config, 'r') as f:
-            runner_config = yaml.safe_load(f)
-    except FileNotFoundError as e:
-        raise ValueError(
-            f"Runner config file '{args.runner_config}' does not exist.")
-
-    runner_nodes = runner_config.get(args.runner_type)
-
-    if not runner_nodes:
-        raise ValueError(
-            f"Runner '{args.runner_type}' does not exist in runner config '{args.runner_config}'. Must choose from existing runner types: '{', '.join(runner_config.keys())}'.")
-
-    # Filter runner nodes if filter is specified
-    if args.runner_node_filter:
-        runner_nodes = [node for node in runner_nodes if args.runner_node_filter in node]
-        if not runner_nodes:
-            raise ValueError(
-                f"No runner nodes found matching filter '{args.runner_node_filter}' for runner type '{args.runner_type}'.")
-
-    matrix_values = []
-    for key, val in all_config_data.items():
-        # Only consider configs with specified runner
-        if val[FIELD_RUNNER] != args.runner_type:
-            continue
-
-        # Get model code for exp_name
-        model_code = val[FIELD_MODEL_PREFIX]
-
-        # Find 1k1k config
-        target_config = None
-        for config in val[FIELD_SEQ_LEN_CONFIGS]:
-            if config[FIELD_ISL] == 1024 and config[FIELD_OSL] == 1024:
-                target_config = config
-                break
-
-        # Skip configs that have no 1k1k seq-len entry
-        if target_config is None:
-            continue
-
-        highest_tp_bmk = max(target_config[FIELD_SEARCH_SPACE], key=lambda x: x[FIELD_TP])
-        # Since we are just testing, pick the highest TP for this config and just test
-        # on that TP with the lowest concurrency available
-        highest_tp = highest_tp_bmk[FIELD_TP]
-        lowest_conc = highest_tp_bmk[FIELD_CONC_START]
-
-        ep = highest_tp_bmk.get(FIELD_EP)
-        dp_attn = highest_tp_bmk.get(FIELD_DP_ATTN)
-
-        for node in runner_nodes:
-            entry = {
-                FIELD_IMAGE: val[FIELD_IMAGE],
-                FIELD_MODEL: val[FIELD_MODEL],
-                FIELD_PRECISION: val[FIELD_PRECISION],
-                FIELD_FRAMEWORK: val[FIELD_FRAMEWORK],
-                # Add one entry for each node under specified runner type
-                FIELD_RUNNER: node,
-                # Again, just use 1k1k since this is just meant to smoke test all runners
-                FIELD_ISL: 1024,
-                FIELD_OSL: 1024,
-                FIELD_TP: highest_tp,
-                FIELD_EP: 1, # Default
-                FIELD_DP_ATTN: False, # Default
-                FIELD_CONC: lowest_conc,
-                FIELD_MAX_MODEL_LEN: 2048,
-                FIELD_EXP_NAME: f"{model_code}_test",
-            }
-
-            # Add optional fields if they exist
-            if ep is not None:
-                entry[FIELD_EP] = ep
-            if dp_attn is not None:
-                entry[FIELD_DP_ATTN] = dp_attn
-
-            matrix_values.append(entry)
-
-    return matrix_values
-
-
-def generate_custom_test(args):
-    """Generate a single 1k1k job for custom inputs.
-    """
-    try:
-        with open(args.runner_config, 'r') as f:
-            runner_config = yaml.safe_load(f)
-    except FileNotFoundError as e:
-        raise ValueError(
-            f"Runner config file '{args.runner_config}' does not exist.")
-
-    found_runner_label = False
-    for runner_type, runner_nodes in runner_config.items():
-        if args.runner_label == runner_type or args.runner_label in runner_nodes:
-            found_runner_label = True
-
-    if not found_runner_label:
-        raise ValueError(f"Unable to find specified runner label '{args.runner_label}'.")
-
-    return [
-        {
-            FIELD_IMAGE: args.image,
-            FIELD_MODEL: args.model,
-            FIELD_PRECISION: args.precision,
-            FIELD_FRAMEWORK: args.framework,
-            FIELD_RUNNER: args.runner_label,
-            # Again, just use 1k1k since this is just meant to smoke test all runners
-            FIELD_ISL: 1024,
-            FIELD_OSL: 1024,
-            FIELD_TP: 8,
-            FIELD_EP: 1,
-            FIELD_DP_ATTN: False,
-            FIELD_CONC: 4,
-            FIELD_EXP_NAME: args.exp_name,
-            FIELD_MAX_MODEL_LEN: 2048,
-        }
-    ]
-
-
-def generate_runner_sweep_config(args, all_config_data):
-    """Generate runner sweep configurations.
-
-    Assumes all_config_data has been validated by validate_master_configs_structure().
-    """
-    try:
-        with open(args.runner_config, 'r') as f:
-            runner_config = yaml.safe_load(f)
-    except FileNotFoundError as e:
-        raise ValueError(
-            f"Runner config file '{args.runner_config}' does not exist.")
-
-    if not runner_config.get(args.runner_type):
-        raise ValueError(
-            f"Runner '{args.runner_type}' does not exist in runner config '{args.runner_config}'. Must choose from existing runner types: '{', '.join(runner_config.keys())}'.")
-
-    matrix_values = []
-    for key, val in all_config_data.items():
-        # Only consider configs with the specified model prefix and runner type
-        if not key.startswith(args.model_prefix):
-            continue
-
-        if not val[FIELD_RUNNER] == args.runner_type:
-            continue
-
-        # Optionally filter by precision and framework
-        if (args.precision and val[FIELD_PRECISION] != args.precision) or (args.framework and val[FIELD_FRAMEWORK] != args.framework):
-            continue
-
-        # Get model code for exp_name
-        model_code = val[FIELD_MODEL_PREFIX]
-
-        runner_nodes = runner_config.get(val[FIELD_RUNNER])
-        if not runner_nodes:
-            raise ValueError(
-                f"Runner '{val[FIELD_RUNNER]}' does not exist in runner config '{args.runner_config}'. Must choose from existing runner types: '{', '.join(runner_config.keys())}'.")
-
-        # Find 1k1k config
-        target_config = None
-        for config in val[FIELD_SEQ_LEN_CONFIGS]:
-            if config[FIELD_ISL] == 1024 and config[FIELD_OSL] == 1024:
-                target_config = config
-                break
-
-        # Skip configs that have no 1k1k seq-len entry
-        if target_config is None:
-            continue
-
-        highest_tp_bmk = max(target_config[FIELD_SEARCH_SPACE], key=lambda x: x[FIELD_TP])
-        # Since we are just testing, pick the highest TP for this config and just test
-        # on that TP with the lowest concurrency available
-        highest_tp = highest_tp_bmk[FIELD_TP]
-        lowest_conc = highest_tp_bmk[FIELD_CONC_START]
-
-        ep = highest_tp_bmk.get(FIELD_EP)
-        dp_attn = highest_tp_bmk.get(FIELD_DP_ATTN)
-
-        for node in runner_nodes:
-            entry = {
-                FIELD_IMAGE: val[FIELD_IMAGE],
-                FIELD_MODEL: val[FIELD_MODEL],
-                FIELD_PRECISION: val[FIELD_PRECISION],
-                FIELD_FRAMEWORK: val[FIELD_FRAMEWORK],
-                # Add one entry for each node under specified runner type
-                FIELD_RUNNER: node,
-                # Again, just use 1k1k since this is just meant to smoke test all runners
-                FIELD_ISL: 1024,
-                FIELD_OSL: 1024,
-                FIELD_TP: highest_tp,
-                FIELD_EP: 1, # Default
-                FIELD_DP_ATTN: False, # Default
-                FIELD_CONC: lowest_conc,
-                FIELD_EXP_NAME: f"{model_code}_test",
-                FIELD_MAX_MODEL_LEN: 2048,
-            }
-
-            # Add optional fields if they exist
-            if ep is not None:
-                entry[FIELD_EP] = ep
-            if dp_attn is not None:
-                entry[FIELD_DP_ATTN] = dp_attn
-
-            matrix_values.append(entry)
-
-    if len(matrix_values) == 0:
-        error_msg = f"No configs found matching model prefix '{args.model_prefix}'"
-        if args.precision:
-            error_msg += f", precision '{args.precision}'"
-        if args.framework:
-            error_msg += f", framework '{args.framework}'"
-        raise ValueError(error_msg + ".")
-
-    return matrix_values
-
-
-def load_config_files(config_files):
-    """Load and merge configuration files."""
-    all_config_data = {}
-    for config_file in config_files:
-        try:
-            with open(config_file, 'r') as f:
-                config_data = yaml.safe_load(f)
-            assert isinstance(
-                config_data, dict), f"Config file '{config_file}' must contain a dictionary"
-
-            # Check for duplicate keys. This guards against the unlikely case where an
-            # entry in one config accidentally (or intentionally) overrides an entry in another config
-            duplicate_keys = set(all_config_data.keys()) & set(
-                config_data.keys())
-            if duplicate_keys:
-                raise ValueError(
-                    f"Duplicate configuration keys found in '{config_file}': {', '.join(sorted(duplicate_keys))}"
-                )
-
-            all_config_data.update(config_data)
-        except FileNotFoundError:
-            raise ValueError(f"Input file '{config_file}' does not exist.")
-
-    return all_config_data
-
-
-def main():
-    # Create parent parser with common arguments
-    parent_parser = argparse.ArgumentParser(add_help=False)
-    parent_parser.add_argument(
-        '--config-files',
-        nargs='+',
-        required=True,
-        help='One or more configuration files (YAML format)'
-    )
-    parent_parser.add_argument(
-        '--run-evals',
-        action='store_true',
-        help='Opt-in flag to mark a subset of generated configs to run evals. When omitted, no evals run.'
-    )
-
-    # Create main parser
-    parser = argparse.ArgumentParser(
-        description='Generate benchmark configurations from YAML config files'
-    )
-
-    # Create subparsers for subcommands
-    subparsers = parser.add_subparsers(
-        dest='command',
-        required=True,
-        help='Available commands'
-    )
-
-    # Subcommand: full-sweep
-    full_sweep_parser = subparsers.add_parser(
-        'full-sweep',
-        parents=[parent_parser],
-        add_help=False,
-        help='Generate full sweep configurations with optional filtering by model, precision, framework, runner type, and sequence lengths'
-    )
-    full_sweep_parser.add_argument(
-        '--model-prefix',
-        nargs='+',
-        required=False,
-        help='Model prefix(es) to filter configurations (optional, can specify multiple)'
-    )
-    full_sweep_parser.add_argument(
-        '--precision',
-        nargs='+',
-        required=False,
-        help='Precision(s) to filter by (e.g., fp4, fp8) (optional, can specify multiple)'
-    )
-    full_sweep_parser.add_argument(
-        '--framework',
-        nargs='+',
-        required=False,
-        help='Framework(s) to filter by (e.g., vllm, trt, sglang) (optional, can specify multiple)'
-    )
-    full_sweep_parser.add_argument(
-        '--runner-type',
-        nargs='+',
-        required=False,
-        help='Runner type(s) to filter by (e.g., h200, h100) (optional, can specify multiple)'
-    )
-    full_sweep_parser.add_argument(
-        '--runner-config',
-        required=False,
-        help='Configuration file holding runner information (required if --runner-type is specified)'
-    )
-    full_sweep_parser.add_argument(
-        '--seq-lens',
-        nargs='+',
-        choices=list(seq_len_stoi.keys()),
-        required=False,
-        help=f"Sequence length configurations to include: {', '.join(seq_len_stoi.keys())}. If not specified, all sequence lengths are included."
-    )
-    full_sweep_parser.add_argument(
-        '--step-size',
-        type=int,
-        default=2,
-        help='Step size for concurrency values (default: 2)'
-    )
-    full_sweep_parser.add_argument(
-        '--test-mode',
-        action='store_true',
-        help='Test mode: only run highest TP with lowest concurrency for each matching config'
-    )
-    full_sweep_parser.add_argument(
-        '-h', '--help',
-        action='help',
-        help='Show this help message and exit'
-    )
-
-    # Subcommand: test-config
-    test_config_parser = subparsers.add_parser(
-        'test-config',
-        parents=[parent_parser],
-        add_help=False,
-        help='Given a config key, run that configuration as specified. Optionally specify --test-mode to only run one parallelism-concurrency pair for the config.'
-    )
-    test_config_parser.add_argument(
-        '--runner-config',
-        required=True,
-        help='Configuration file holding runner information'
-    )
-    test_config_parser.add_argument(
-        '--key',
-        required=True,
-        help='Configuration key to use'
-    )
-    test_config_parser.add_argument(
-        '--runner-node',
-        required=False,
-        help='Specific runner node to use'
-    )
-    test_config_parser.add_argument(
-        '--seq-lens',
-        nargs='+',
-        choices=list(seq_len_stoi.keys()),
-        required=False,
-        help=f"Sequence length configurations to include: {', '.join(seq_len_stoi.keys())}. If not specified, all sequence lengths are included."
-    )
-    test_config_parser.add_argument(
-        '--step-size',
-        type=int,
-        default=2,
-        help='Step size for concurrency values (default: 2)'
-    )
-    test_config_parser.add_argument(
-        '--test-mode',
-        action='store_true',
-        help='Generate only the lowest concurrency value for each TP level'
-    )
-    test_config_parser.add_argument(
-        '-h', '--help',
-        action='help',
-        help='Show this help message and exit'
-    )
-
-    # Subcommand: runner-model-sweep
-    test_config_parser = subparsers.add_parser(
-        'runner-model-sweep',
-        parents=[parent_parser],
-        add_help=False,
-        help='Given a runner type, find all configurations matching the type, and run that configuration on all individual runner nodes for the specified runner type. This is meant to validate that all runner nodes work on all configurations for a runner type. For instance, use this to validate that all configs that specify an h200 runner run successfully across all h200 runner nodes.'
-    )
-    test_config_parser.add_argument(
-        '--runner-type',
-        required=True,
-        help='Runner type (e.g., b200-trt, h100)'
-    )
-    test_config_parser.add_argument(
-        '--runner-config',
-        required=True,
-        help='Configuration file holding runner information'
-    )
-    test_config_parser.add_argument(
-        '--runner-node-filter',
-        required=False,
-        help='Filter runner nodes by substring match (e.g., "mi300x-amd" to only include nodes containing that string)'
-    )
-    test_config_parser.add_argument(
-        '-h', '--help',
-        action='help',
-        help='Show this help message and exit'
-    )
-
-    # Subcommand: runner-sweep
-    test_config_parser = subparsers.add_parser(
-        'runner-sweep',
-        parents=[parent_parser],
-        add_help=False,
-        help='Given a model (and optionally a precision and framework), find all configurations matching the inputs, and run those configurations across all compatible runner nodes. This is meant to validate that all runner nodes that should run a particular model can do so. For instance, use this to validate that all runner nodes that should run gptoss-120b actually do so successfully.'
-    )
-    test_config_parser.add_argument(
-        '--runner-type',
-        required=True,
-        help='Runner type (e.g., b200-trt, h100)'
-    )
-    test_config_parser.add_argument(
-        '--model-prefix',
-        required=True,
-        help='Model prefix (e.g., 70b)'
-    )
-    test_config_parser.add_argument(
-        '--precision',
-        required=False,
-        help='Precision to filter by (e.g., fp4) (optional)'
-    )
-    test_config_parser.add_argument(
-        '--framework',
-        required=False,
-        help='Framework to filter by (e.g., trt) (optional)'
-    )
-    test_config_parser.add_argument(
-        '--runner-config',
-        required=True,
-        help='Configuration file holding runner information'
-    )
-    test_config_parser.add_argument(
-        '-h', '--help',
-        action='help',
-        help='Show this help message and exit'
-    )
-
-    # Subcommand: custom
-    test_config_parser = subparsers.add_parser(
-        'custom',
-        parents=[parent_parser],
-        add_help=False,
-        help='Enter custom values'
-    )
-    test_config_parser.add_argument(
-        '--runner-label',
-        required=True,
-        help='Label associated with runner on which to launch the corresponding job (e.g., h200, h200-nv_1, etc.)'
-    )
-    test_config_parser.add_argument(
-        '--image',
-        required=True,
-        help='Image to run the benchmark (e.g., vllm/vllm-openai:latest)'
-    )
-    test_config_parser.add_argument(
-        '--model',
-        required=True,
-        help='Model to run (e.g., openai/gpt-oss-120b)'
-    )
-    test_config_parser.add_argument(
-        '--framework',
-        required=True,
-        help='Framework to run on (e.g., vllm, trt, sglang)'
-    )
-    test_config_parser.add_argument(
-        '--precision',
-        required=True,
-        help='Precision to run (e.g., fp4, fp8)'
-    )
-    test_config_parser.add_argument(
-        '--exp-name',
-        required=True,
-        help='Experiment name (e.g., 70b_test)'
-    )
-    test_config_parser.add_argument(
-        '--runner-config',
-        required=True,
-        help='Configuration file holding runner information'
-    )
-    test_config_parser.add_argument(
-        '-h', '--help',
-        action='help',
-        help='Show this help message and exit'
-    )
-
-    args = parser.parse_args()
-
-    # Load and validate configuration files
-    all_config_data = load_config_files(args.config_files)
-    validate_master_configs_structure(all_config_data)
-
-    # Route to appropriate function based on subcommand
-    if args.command == 'full-sweep':
-        matrix_values = generate_full_sweep(args, all_config_data)
-    elif args.command == 'test-config':
-        matrix_values = generate_test_config(args, all_config_data)
-    elif args.command == 'runner-model-sweep':
-        matrix_values = generate_runner_model_sweep_config(
-            args, all_config_data)
-    elif args.command == 'runner-sweep':
-        matrix_values = generate_runner_sweep_config(
-            args, all_config_data)
-    elif args.command == 'custom':
-        matrix_values = generate_custom_test(args)
-    else:
-        parser.error(f"Unknown command: {args.command}")
-
-    # Mark eval entries (opt-in via --run-evals)
-    if args.run_evals:
-        matrix_values = mark_eval_entries(matrix_values)
-
-    # Validate output before printing
-    validate_matrix_output(matrix_values)
-
-    print(json.dumps(matrix_values))
-    return matrix_values
-
-
-if __name__ == "__main__":
-    main()
diff --git a/utils/matrix-logic/get_test_sweep_configs.py b/utils/matrix-logic/get_test_sweep_configs.py
deleted file mode 100644
index 4df4a51eb..000000000
--- a/utils/matrix-logic/get_test_sweep_configs.py
+++ /dev/null
@@ -1,151 +0,0 @@
-import json
-import yaml
-import sys
-import argparse
-
-seq_len_stoi = {
-    "1k1k": (1024, 1024),
-    "1k8k": (1024, 8192),
-    "8k1k": (8192, 1024)
-}
-
-def main():
-    parser = argparse.ArgumentParser(
-        description='Generate benchmark matrix from a specific 
configuration key' - ) - parser.add_argument( - '--config-files', - nargs='+', - required=True, - help='One or more configuration files (YAML format)' - ) - parser.add_argument( - '--key', - required=True, - help='Configuration key to use' - ) - parser.add_argument( - '--seq-lens', - nargs='+', - choices=list(seq_len_stoi.keys()), - required=False, - help=f"Sequence length configurations to include: {', '.join(seq_len_stoi.keys())}. If not specified, all sequence lengths are included." - ) - parser.add_argument( - '--step-size', - type=int, - default=2, - help='Step size for concurrency values (default: 2)' - ) - - args = parser.parse_args() - - # Convert seq-lens to set of (isl, osl) tuples for filtering - seq_lens_filter = None - if args.seq_lens: - seq_lens_filter = {seq_len_stoi[sl] for sl in args.seq_lens} - - # Load and merge all config files - all_config_data = {} - for config_file in args.config_files: - try: - with open(config_file, 'r') as f: - config_data = yaml.safe_load(f) - assert isinstance(config_data, dict), f"Config file '{config_file}' must contain a dictionary" - - # Check for duplicate keys - duplicate_keys = set(all_config_data.keys()) & set(config_data.keys()) - if duplicate_keys: - raise ValueError( - f"Duplicate configuration keys found in '{config_file}': {', '.join(sorted(duplicate_keys))}" - ) - - all_config_data.update(config_data) - except FileNotFoundError: - raise ValueError(f"Input file '{config_file}' does not exist.") - - # Check if the key exists - if args.key not in all_config_data: - available_keys = ', '.join(sorted(all_config_data.keys())) - raise ValueError( - f"Key '{args.key}' not found in configuration files. " - f"Available keys: {available_keys}" - ) - - val = all_config_data[args.key] - - # Validate required fields - seq_len_configs = val.get('seq-len-configs') - assert seq_len_configs, f"Missing 'seq-len-configs' for key '{args.key}'" - - image = val.get('image') - model = val.get('model') - precision = val.get('precision') - framework = val.get('framework') - runner = val.get('runner') - - assert None not in (image, model, precision, framework, runner), \ - f"Missing required fields (image, model, precision, framework, runner) for key '{args.key}'" - - matrix_values = [] - - # Process each sequence length configuration - for seq_config in seq_len_configs: - isl = seq_config.get('isl') - osl = seq_config.get('osl') - - assert None not in (isl, osl), \ - f"Missing 'isl' or 'osl' in seq-len-config for key '{args.key}'" - - # Filter by sequence lengths if specified - if seq_lens_filter and (isl, osl) not in seq_lens_filter: - continue - - bmk_space = seq_config.get('bmk-space') - assert bmk_space, f"Missing 'bmk-space' in seq-len-config for key '{args.key}'" - - for bmk in bmk_space: - tp = bmk.get('tp') - conc_start = bmk.get('conc-start') - conc_end = bmk.get('conc-end') - ep = bmk.get('ep') - dp_attn = bmk.get('dp-attn') - - assert None not in (tp, conc_start, conc_end), \ - f"Missing 'tp', 'conc-start', or 'conc-end' in bmk-space for key '{args.key}'" - - # Generate entries for each concurrency value in the range - conc = conc_start - while conc <= conc_end: - entry = { - 'image': image, - 'model': model, - 'precision': precision, - 'framework': framework, - 'runner': runner, - 'isl': isl, - 'osl': osl, - 'tp': tp, - 'conc': conc, - 'max-model-len': isl + osl, - } - - # Add optional fields if they exist - if ep is not None: - entry['ep'] = ep - if dp_attn is not None: - entry['dp-attn'] = dp_attn - - matrix_values.append(entry) - - if conc 
== conc_end: - break - conc *= args.step_size - if conc > conc_end: - conc = conc_end - - print(json.dumps(matrix_values)) - return matrix_values - -if __name__ == "__main__": - main() diff --git a/utils/matrix-logic/test_generate_sweep_configs.py b/utils/matrix-logic/test_generate_sweep_configs.py deleted file mode 100644 index c184ecbab..000000000 --- a/utils/matrix-logic/test_generate_sweep_configs.py +++ /dev/null @@ -1,1656 +0,0 @@ -import pytest -import yaml -from unittest.mock import patch -from generate_sweep_configs import ( - validate_master_configs_structure, - validate_matrix_output, - seq_len_to_str, - generate_full_sweep, - generate_test_config, - generate_runner_model_sweep_config, - generate_runner_sweep_config, - generate_custom_test, - load_config_files, - main, - MatrixEntry, -) - - -# Fixtures for test config files -@pytest.fixture -def sample_master_config(): - """Sample master config with valid entries.""" - return { - "70b-fp8-vllm": { - "image": "vllm/vllm-openai:v0.10.2", - "model": "meta-llama/Llama-3-70b", - "model-prefix": "70b", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [ - {"tp": 4, "conc-start": 1, "conc-end": 4}, - {"tp": 8, "conc-start": 2, "conc-end": 8, "ep": 2, "dp-attn": True} - ] - }, - { - "isl": 1024, - "osl": 8192, - "search-space": [ - {"tp": 8, "conc-start": 1, "conc-end": 2} - ] - } - ] - }, - "8b-fp4-trt": { - "image": "nvcr.io/nvidia/tritonserver:24.01", - "model": "meta-llama/Llama-3-8b", - "model-prefix": "8b", - "precision": "fp4", - "framework": "trt", - "runner": "h100", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [ - {"tp": 2, "conc-start": 4, "conc-end": 16} - ] - } - ] - }, - "gptoss-120b-fp8-vllm": { - "image": "vllm/vllm-openai:latest", - "model": "openai/gpt-oss-120b", - "model-prefix": "gptoss", - "precision": "fp8", - "framework": "vllm", - "runner": "h200-trt", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [ - {"tp": 8, "conc-start": 1, "conc-end": 4} - ] - } - ] - } - } - - -@pytest.fixture -def sample_runner_config(): - """Sample runner config.""" - return { - "h200": ["h200-nv_1", "h200-nv_2"], - "h100": ["h100-aws_1"], - "h200-trt": ["h200-trt_1", "h200-trt_2", "h200-trt_3"] - } - - -@pytest.fixture -def temp_config_files(tmp_path, sample_master_config, sample_runner_config): - """Create temporary config files.""" - master_file = tmp_path / "master.yaml" - runner_file = tmp_path / "runners.yaml" - - with open(master_file, 'w') as f: - yaml.dump(sample_master_config, f) - - with open(runner_file, 'w') as f: - yaml.dump(sample_runner_config, f) - - return str(master_file), str(runner_file) - - -@pytest.fixture -def invalid_master_config(): - """Master config with validation errors.""" - return { - "missing-field": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - # Missing precision, framework, runner, seq-len-configs - } - } - - -# Tests for seq_len_to_str -def test_seq_len_to_str_with_mapping(): - """Test seq_len_to_str with known mappings.""" - assert seq_len_to_str(1024, 1024) == "1k1k" - assert seq_len_to_str(1024, 8192) == "1k8k" - assert seq_len_to_str(8192, 1024) == "8k1k" - - -def test_seq_len_to_str_without_mapping(): - """Test seq_len_to_str fallback for unknown mappings.""" - assert seq_len_to_str(2048, 4096) == "2048_4096" - assert seq_len_to_str(512, 512) == "512_512" - - -# Tests for MatrixEntry validation -def 
test_matrix_entry_valid(): - """Test valid MatrixEntry.""" - entry = { - "image": "test:latest", - "model": "test/model", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "isl": 1024, - "osl": 1024, - "tp": 8, - "ep": 1, - "dp-attn": False, - "conc": 4, - "max-model-len": 2048, - "exp-name": "test_exp" - } - result = MatrixEntry(**entry) - assert result.image == "test:latest" - assert result.tp == 8 - - -def test_matrix_entry_missing_field(): - """Test MatrixEntry with missing required field.""" - entry = { - "image": "test:latest", - "model": "test/model", - # Missing other required fields - } - with pytest.raises(Exception): # Pydantic ValidationError - MatrixEntry(**entry) - - -def test_matrix_entry_wrong_type(): - """Test MatrixEntry with wrong type.""" - entry = { - "image": "test:latest", - "model": "test/model", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "isl": "not_an_int", # Wrong type - "osl": 1024, - "tp": 8, - "ep": 1, - "dp-attn": False, - "conc": 4, - "max-model-len": 2048, - "exp-name": "test_exp" - } - with pytest.raises(Exception): # Pydantic ValidationError - MatrixEntry(**entry) - - -def test_matrix_entry_extra_field(): - """Test MatrixEntry with extra field (should be forbidden).""" - entry = { - "image": "test:latest", - "model": "test/model", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "isl": 1024, - "osl": 1024, - "tp": 8, - "ep": 1, - "dp-attn": False, - "conc": 4, - "max-model-len": 2048, - "exp-name": "test_exp", - "extra-field": "should_fail" - } - with pytest.raises(Exception): # Pydantic ValidationError - MatrixEntry(**entry) - - -# Tests for validate_matrix_output -def test_validate_matrix_output_valid(): - """Test validate_matrix_output with valid entries.""" - entries = [ - { - "image": "test:latest", - "model": "test/model", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "isl": 1024, - "osl": 1024, - "tp": 8, - "ep": 1, - "dp-attn": False, - "conc": 4, - "max-model-len": 2048, - "exp-name": "test_exp" - } - ] - result = validate_matrix_output(entries) - assert result == entries - - -def test_validate_matrix_output_invalid(): - """Test validate_matrix_output with invalid entry.""" - entries = [ - { - "image": "test:latest", - "model": "test/model", - # Missing required fields - } - ] - with pytest.raises(ValueError, match="Matrix entry at index 0 failed validation"): - validate_matrix_output(entries) - - -def test_validate_matrix_output_multiple_entries(): - """Test validate_matrix_output with multiple entries.""" - entries = [ - { - "image": "test:latest", - "model": "test/model", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "isl": 1024, - "osl": 1024, - "tp": 8, - "ep": 1, - "dp-attn": False, - "conc": 4, - "max-model-len": 2048, - "exp-name": "test_exp" - }, - { - "image": "test2:latest", - "model": "test2/model", - "precision": "fp4", - "framework": "trt", - "runner": "h100", - "isl": 1024, - "osl": 1024, - "tp": 4, - "ep": 2, - "dp-attn": True, - "conc": 8, - "max-model-len": 2048, - "exp-name": "test_exp2" - } - ] - result = validate_matrix_output(entries) - assert len(result) == 2 - - -# Tests for validate_master_configs_structure -def test_validate_master_configs_structure_valid(sample_master_config): - """Test validation of valid master config.""" - validate_master_configs_structure(sample_master_config) - - -def test_validate_master_configs_structure_missing_field(): - """Test validation with missing required field.""" - config = { - 
"test-key": { - "image": "test:latest", - "model-prefix": "test", - # Missing other required fields - } - } - with pytest.raises(ValueError, match="Missing required field"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_wrong_type(): - """Test validation with wrong field type.""" - config = { - "test-key": { - "image": 123, # Should be string - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [] - } - } - with pytest.raises(ValueError, match="must be str"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_empty_seq_len_configs(): - """Test validation with empty seq-len-configs.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [] - } - } - with pytest.raises(ValueError, match="must be a non-empty list"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_invalid_search_space(): - """Test validation with invalid search-space.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [ - {"tp": 8} # Missing conc-start and conc-end - ] - } - ] - } - } - with pytest.raises(ValueError, match="Missing 'conc-start'"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_missing_search_space(): - """Test validation with missing search-space.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024 - # Missing search-space - } - ] - } - } - with pytest.raises(ValueError, match="Missing or invalid 'search-space'"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_search_space_not_list(): - """Test validation with search-space not being a list.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": "not_a_list" - } - ] - } - } - with pytest.raises(ValueError, match="Missing or invalid 'search-space'"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_extra_fields_in_search_space(): - """Test validation with extra fields in search-space.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [ - { - "tp": 8, - "conc-start": 1, - "conc-end": 4, - "invalid-field": "value" - } - ] - } - ] - } - } - with pytest.raises(ValueError, match="Extra fields"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_missing_isl(): - """Test validation with missing isl.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - 
{ - "osl": 1024, - "search-space": [{"tp": 8, "conc-start": 1, "conc-end": 4}] - } - ] - } - } - with pytest.raises(ValueError, match="Missing 'isl'"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_wrong_isl_type(): - """Test validation with wrong isl type.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": "not_int", - "osl": 1024, - "search-space": [{"tp": 8, "conc-start": 1, "conc-end": 4}] - } - ] - } - } - with pytest.raises(ValueError, match="'isl' must be int"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_missing_osl(): - """Test validation with missing osl.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "search-space": [{"tp": 8, "conc-start": 1, "conc-end": 4}] - } - ] - } - } - with pytest.raises(ValueError, match="Missing 'osl'"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_wrong_osl_type(): - """Test validation with wrong osl type.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": "not_int", - "search-space": [{"tp": 8, "conc-start": 1, "conc-end": 4}] - } - ] - } - } - with pytest.raises(ValueError, match="'osl' must be int"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_wrong_tp_type(): - """Test validation with wrong tp type.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [{"tp": "not_int", "conc-start": 1, "conc-end": 4}] - } - ] - } - } - with pytest.raises(ValueError, match="'tp' must be int"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_wrong_conc_start_type(): - """Test validation with wrong conc-start type.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [{"tp": 8, "conc-start": "not_int", "conc-end": 4}] - } - ] - } - } - with pytest.raises(ValueError, match="'conc-start' must be int"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_wrong_conc_end_type(): - """Test validation with wrong conc-end type.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [{"tp": 8, "conc-start": 1, "conc-end": "not_int"}] - } - ] - } - } - with pytest.raises(ValueError, match="'conc-end' must be int"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_wrong_ep_type(): - """Test validation with wrong ep type.""" - config = { - "test-key": { - "image": "test:latest", - "model": 
"test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [{"tp": 8, "conc-start": 1, "conc-end": 4, "ep": "not_int"}] - } - ] - } - } - with pytest.raises(ValueError, match="'ep' must be int"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_wrong_dp_attn_type(): - """Test validation with wrong dp-attn type.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [{"tp": 8, "conc-start": 1, "conc-end": 4, "dp-attn": "not_bool"}] - } - ] - } - } - with pytest.raises(ValueError, match="'dp-attn' must be bool"): - validate_master_configs_structure(config) - - -# Tests for load_config_files -def test_load_config_files_valid(temp_config_files): - """Test loading valid config files.""" - master_file, _ = temp_config_files - result = load_config_files([master_file]) - assert len(result) == 3 - assert "70b-fp8-vllm" in result - - -def test_load_config_files_multiple(tmp_path, sample_master_config): - """Test loading multiple config files.""" - file1 = tmp_path / "config1.yaml" - file2 = tmp_path / "config2.yaml" - - config1 = {"70b-fp8-vllm": sample_master_config["70b-fp8-vllm"]} - config2 = {"8b-fp4-trt": sample_master_config["8b-fp4-trt"]} - - with open(file1, 'w') as f: - yaml.dump(config1, f) - with open(file2, 'w') as f: - yaml.dump(config2, f) - - result = load_config_files([str(file1), str(file2)]) - assert len(result) == 2 - - -def test_load_config_files_not_found(): - """Test loading non-existent config file.""" - with pytest.raises(ValueError, match="does not exist"): - load_config_files(["/nonexistent/file.yaml"]) - - -def test_load_config_files_duplicate_keys(tmp_path, sample_master_config): - """Test loading files with duplicate keys.""" - file1 = tmp_path / "config1.yaml" - file2 = tmp_path / "config2.yaml" - - config1 = {"70b-fp8-vllm": sample_master_config["70b-fp8-vllm"]} - config2 = {"70b-fp8-vllm": sample_master_config["70b-fp8-vllm"]} # Duplicate - - with open(file1, 'w') as f: - yaml.dump(config1, f) - with open(file2, 'w') as f: - yaml.dump(config2, f) - - with pytest.raises(ValueError, match="Duplicate configuration keys"): - load_config_files([str(file1), str(file2)]) - - -# Tests for generate_full_sweep -def test_generate_full_sweep_basic(sample_master_config, temp_config_files): - """Test basic full sweep generation.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["70b"] - seq_lens = ["1k1k"] - step_size = 2 - precision = None - framework = None - runner_type = None - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - assert len(result) > 0 - assert all(entry['exp-name'].startswith('70b_1k1k') for entry in result) - assert all(entry['isl'] == 1024 and entry['osl'] == 1024 for entry in result) - - -def test_generate_full_sweep_with_optionals(sample_master_config, temp_config_files): - """Test full sweep with optional ep and dp-attn.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["70b"] - seq_lens = ["1k1k"] - step_size = 2 - precision = None - framework = None - runner_type = None - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - # Find entry 
with tp=8 which should have ep=2 and dp-attn=True
-    tp8_entries = [e for e in result if e['tp'] == 8]
-    assert len(tp8_entries) > 0
-    assert all(e['ep'] == 2 for e in tp8_entries)
-    assert all(e['dp-attn'] == True for e in tp8_entries)
-
-
-def test_generate_full_sweep_no_matches(sample_master_config, temp_config_files):
-    """Test full sweep with no matching configs."""
-    _, runner_file = temp_config_files
-
-    class Args:
-        model_prefix = ["nonexistent"]
-        seq_lens = ["1k1k"]
-        step_size = 2
-        precision = None
-        framework = None
-        runner_type = None
-        test_mode = False
-        runner_config = runner_file
-
-    with pytest.raises(ValueError, match="No configs found matching filters"):
-        generate_full_sweep(Args(), sample_master_config)
-
-
-def test_generate_full_sweep_different_seq_len(sample_master_config, temp_config_files):
-    """Test full sweep with different sequence length."""
-    _, runner_file = temp_config_files
-
-    class Args:
-        model_prefix = ["70b"]
-        seq_lens = ["1k8k"]
-        step_size = 2
-        precision = None
-        framework = None
-        runner_type = None
-        test_mode = False
-        runner_config = runner_file
-
-    result = generate_full_sweep(Args(), sample_master_config)
-    assert len(result) > 0
-    assert all(entry['isl'] == 1024 and entry['osl'] == 8192 for entry in result)
-
-
-def test_generate_full_sweep_step_size(sample_master_config, temp_config_files):
-    """Test full sweep with different step size."""
-    _, runner_file = temp_config_files
-
-    class Args:
-        model_prefix = ["8b"]
-        seq_lens = ["1k1k"]
-        step_size = 4
-        precision = None
-        framework = None
-        runner_type = None
-        test_mode = False
-        runner_config = runner_file
-
-    result = generate_full_sweep(Args(), sample_master_config)
-    # Should have entries at conc=4 and conc=16 (step_size=4, conc-start=4, conc-end=16)
-    conc_values = sorted(set(e['conc'] for e in result))
-    assert 4 in conc_values
-    assert 16 in conc_values
-
-
-def test_generate_full_sweep_seq_len_not_in_config(temp_config_files):
-    """Test full sweep when requested seq-len is not in config."""
-    _, runner_file = temp_config_files
-
-    config = {
-        "test-fp8-vllm": {
-            "image": "test:latest",
-            "model": "test/model",
-            "model-prefix": "test",
-            "precision": "fp8",
-            "framework": "vllm",
-            "runner": "h200",
-            "seq-len-configs": [
-                {
-                    "isl": 8192,
-                    "osl": 1024,  # Only has 8k1k, not 1k1k
-                    "search-space": [
-                        {"tp": 4, "conc-start": 1, "conc-end": 4}
-                    ]
-                }
-            ]
-        }
-    }
-
-    class Args:
-        model_prefix = ["test"]
-        seq_lens = ["1k1k"]  # Requesting 1k1k but config only has 8k1k
-        step_size = 2
-        precision = None
-        framework = None
-        runner_type = None
-        test_mode = False
-        runner_config = runner_file
-
-    # Should raise error since no matching seq-len
-    with pytest.raises(ValueError, match="No configs found matching filters"):
-        generate_full_sweep(Args(), config)
-
-
-def test_generate_full_sweep_concurrency_overshoot(temp_config_files):
-    """Test full sweep when concurrency step overshoots end value."""
-    _, runner_file = temp_config_files
-
-    config = {
-        "test-fp8-vllm": {
-            "image": "test:latest",
-            "model": "test/model",
-            "model-prefix": "test",
-            "precision": "fp8",
-            "framework": "vllm",
-            "runner": "h200",
-            "seq-len-configs": [
-                {
-                    "isl": 1024,
-                    "osl": 1024,
-                    "search-space": [
-                        {"tp": 4, "conc-start": 1, "conc-end": 5}  # 1, 3, then 3*3=9 overshoots, clamps to 5
-                    ]
-                }
-            ]
-        }
-    }
-
-    class Args:
-        model_prefix = ["test"]
-        seq_lens = ["1k1k"]
-        step_size = 3  # Will overshoot: 1, 3, 9 (clamped to 5)
-        precision = None
-        framework = None
-        runner_type = None
-        test_mode = False
-
runner_config = runner_file - - result = generate_full_sweep(Args(), config) - conc_values = sorted(set(e['conc'] for e in result)) - # Should have 1, 3, 5 (5 is the clamped value) - assert conc_values == [1, 3, 5] - - -# Tests for generate_full_sweep with filters -def test_generate_full_sweep_no_filters(sample_master_config, temp_config_files): - """Test filtered sweep with no filters.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = None - precision = None - framework = None - runner_type = None - seq_lens = None - step_size = 2 - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - assert len(result) > 0 - - -def test_generate_full_sweep_with_filters_model_prefix(sample_master_config, temp_config_files): - """Test filtered sweep with model prefix filter.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["70b"] - precision = None - framework = None - runner_type = None - seq_lens = None - step_size = 2 - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - assert all("70b" in entry['exp-name'] for entry in result) - - -def test_generate_full_sweep_with_filters_multiple_filters(sample_master_config, temp_config_files): - """Test filtered sweep with multiple filters.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["70b"] - precision = ["fp8"] - framework = ["vllm"] - runner_type = None - seq_lens = ["1k1k"] - step_size = 2 - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - assert len(result) > 0 - assert all(entry['precision'] == 'fp8' for entry in result) - assert all(entry['framework'] == 'vllm' for entry in result) - - -def test_generate_full_sweep_with_filters_test_mode(sample_master_config, temp_config_files): - """Test filtered sweep in test mode.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["70b"] - precision = None - framework = None - runner_type = None - seq_lens = ["1k1k"] - step_size = 2 - test_mode = True - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - # In test mode, should only get one entry per seq-len (highest TP, lowest conc) - assert len(result) == 1 # Only one config matches 70b with 1k1k - assert result[0]['tp'] == 8 # Highest TP - assert '70b_1k1k' in result[0]['exp-name'] - - -def test_generate_full_sweep_with_filters_runner_type_validation(sample_master_config, temp_config_files): - """Test filtered sweep with invalid runner type.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = None - precision = None - framework = None - runner_type = ["invalid-runner"] - seq_lens = None - step_size = 2 - test_mode = False - runner_config = runner_file - - with pytest.raises(ValueError, match="Invalid runner type"): - generate_full_sweep(Args(), sample_master_config) - - -def test_generate_full_sweep_with_filters_runner_type_no_config(sample_master_config): - """Test filtered sweep with runner type but no config file.""" - class Args: - model_prefix = None - precision = None - framework = None - runner_type = ["h200"] - seq_lens = None - step_size = 2 - test_mode = False - runner_config = None - - with pytest.raises(ValueError, match="runner-config is required"): - generate_full_sweep(Args(), sample_master_config) - - -def test_generate_full_sweep_with_filters_multiple_runner_types(sample_master_config, temp_config_files): - 
"""Test filtered sweep with multiple runner types.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = None - precision = None - framework = None - runner_type = ["h200", "h100"] - seq_lens = ["1k1k"] - step_size = 2 - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - runners = set(entry['runner'] for entry in result) - assert 'h200' in runners or 'h100' in runners - - -def test_generate_full_sweep_with_filters_no_matches(sample_master_config, temp_config_files): - """Test filtered sweep with no matching configs.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["nonexistent"] - precision = None - framework = None - runner_type = None - seq_lens = None - step_size = 2 - test_mode = False - runner_config = runner_file - - with pytest.raises(ValueError, match="No configs found matching filters"): - generate_full_sweep(Args(), sample_master_config) - - -def test_generate_full_sweep_with_filters_concurrency_overshoot(temp_config_files): - """Test filtered sweep when concurrency step overshoots end value.""" - _, runner_file = temp_config_files - - config = { - "test-fp8-vllm": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [ - {"tp": 4, "conc-start": 2, "conc-end": 7} # 2, 8 overshoots, clamps to 7 - ] - } - ] - } - } - - class Args: - model_prefix = None - precision = None - framework = None - runner_type = None - seq_lens = None - step_size = 4 # Will overshoot: 2, 8 (clamped to 7) - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), config) - conc_values = sorted(set(e['conc'] for e in result)) - # Should have 2, 7 (7 is the clamped value) - assert 2 in conc_values - assert 7 in conc_values - - -# Tests for generate_test_config -def test_generate_test_config_basic(sample_master_config, temp_config_files): - """Test basic test config generation.""" - _, runner_file = temp_config_files - - class Args: - key = "70b-fp8-vllm" - runner_config = runner_file - runner_node = "h200-nv_1" - seq_lens = None - step_size = 2 - test_mode = False - - result = generate_test_config(Args(), sample_master_config) - assert len(result) > 0 - - -def test_generate_test_config_test_mode(sample_master_config, temp_config_files): - """Test test config in test mode.""" - _, runner_file = temp_config_files - - class Args: - key = "70b-fp8-vllm" - runner_config = runner_file - runner_node = "h200-nv_1" - seq_lens = ["1k1k"] - step_size = 2 - test_mode = True - - result = generate_test_config(Args(), sample_master_config) - # In test mode, should only use lowest concurrency - assert all(entry['conc'] == 1 or entry['conc'] == 2 for entry in result) - - -def test_generate_test_config_specific_runner_node(sample_master_config, temp_config_files): - """Test test config with specific runner node.""" - _, runner_file = temp_config_files - - class Args: - key = "70b-fp8-vllm" - runner_config = runner_file - runner_node = "h200-nv_1" - seq_lens = None - step_size = 2 - test_mode = False - - result = generate_test_config(Args(), sample_master_config) - assert all(entry['runner'] == 'h200-nv_1' for entry in result) - - -def test_generate_test_config_invalid_key(sample_master_config, temp_config_files): - """Test test config with invalid key.""" - _, runner_file = temp_config_files - - class Args: - key = 
"nonexistent-key" - runner_config = runner_file - runner_node = None - seq_lens = None - step_size = 2 - test_mode = False - - with pytest.raises(ValueError, match="does not exist in config files"): - generate_test_config(Args(), sample_master_config) - - -def test_generate_test_config_invalid_runner_node(sample_master_config, temp_config_files): - """Test test config with invalid runner node.""" - _, runner_file = temp_config_files - - class Args: - key = "70b-fp8-vllm" - runner_config = runner_file - runner_node = "invalid-node" - seq_lens = None - step_size = 2 - test_mode = False - - with pytest.raises(ValueError, match="is not compatible"): - generate_test_config(Args(), sample_master_config) - - -def test_generate_test_config_missing_runner_config(sample_master_config): - """Test test config with missing runner config file.""" - class Args: - key = "70b-fp8-vllm" - runner_config = "/nonexistent/file.yaml" - runner_node = None - seq_lens = None - step_size = 2 - test_mode = False - - with pytest.raises(ValueError, match="does not exist"): - generate_test_config(Args(), sample_master_config) - - -def test_generate_test_config_concurrency_overshoot(temp_config_files): - """Test test config when concurrency step overshoots end value.""" - _, runner_file = temp_config_files - - config = { - "test-fp8-vllm": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [ - {"tp": 4, "conc-start": 1, "conc-end": 6} - ] - } - ] - } - } - - class Args: - key = "test-fp8-vllm" - runner_config = runner_file - runner_node = "h200-nv_1" - seq_lens = None - step_size = 4 # Will overshoot: 1, 4, 16 (clamped to 6) - test_mode = False - - result = generate_test_config(Args(), config) - conc_values = sorted(set(e['conc'] for e in result)) - assert 1 in conc_values - assert 4 in conc_values - assert 6 in conc_values - - -# Tests for generate_runner_model_sweep_config -def test_generate_runner_model_sweep_config(sample_master_config, temp_config_files): - """Test runner-model sweep config generation.""" - _, runner_file = temp_config_files - - class Args: - runner_type = "h200" - runner_config = runner_file - runner_node_filter = None - - result = generate_runner_model_sweep_config(Args(), sample_master_config) - assert len(result) > 0 - # Should have entries for each runner node under h200 - runners = set(entry['runner'] for entry in result) - assert 'h200-nv_1' in runners - assert 'h200-nv_2' in runners - - -def test_generate_runner_model_sweep_config_invalid_runner(sample_master_config, temp_config_files): - """Test runner-model sweep with invalid runner type.""" - _, runner_file = temp_config_files - - class Args: - runner_type = "invalid-runner" - runner_config = runner_file - runner_node_filter = None - - with pytest.raises(ValueError, match="does not exist in runner config"): - generate_runner_model_sweep_config(Args(), sample_master_config) - - -def test_generate_runner_model_sweep_config_with_node_filter(sample_master_config, temp_config_files): - """Test runner-model sweep with runner node filter.""" - _, runner_file = temp_config_files - - class Args: - runner_type = "h200" - runner_config = runner_file - runner_node_filter = "nv_1" - - result = generate_runner_model_sweep_config(Args(), sample_master_config) - # Should only have entries for h200-nv_1 - runners = set(entry['runner'] for entry in result) - assert 'h200-nv_1' in runners - 
assert 'h200-nv_2' not in runners - - -def test_generate_runner_model_sweep_config_with_node_filter_multiple_matches(sample_master_config, temp_config_files): - """Test runner-model sweep with runner node filter matching multiple nodes.""" - _, runner_file = temp_config_files - - class Args: - runner_type = "h200" - runner_config = runner_file - runner_node_filter = "nv" # Should match both nv_1 and nv_2 - - result = generate_runner_model_sweep_config(Args(), sample_master_config) - runners = set(entry['runner'] for entry in result) - assert 'h200-nv_1' in runners - assert 'h200-nv_2' in runners - - -def test_generate_runner_model_sweep_config_with_node_filter_no_matches(sample_master_config, temp_config_files): - """Test runner-model sweep with runner node filter that matches no nodes.""" - _, runner_file = temp_config_files - - class Args: - runner_type = "h200" - runner_config = runner_file - runner_node_filter = "nonexistent" - - with pytest.raises(ValueError, match="No runner nodes found matching filter"): - generate_runner_model_sweep_config(Args(), sample_master_config) - - -def test_generate_runner_model_sweep_config_without_node_filter(sample_master_config, temp_config_files): - """Test runner-model sweep without runner node filter (default behavior).""" - _, runner_file = temp_config_files - - class Args: - runner_type = "h200" - runner_config = runner_file - runner_node_filter = None - - result = generate_runner_model_sweep_config(Args(), sample_master_config) - # Should have entries for all h200 nodes - runners = set(entry['runner'] for entry in result) - assert 'h200-nv_1' in runners - assert 'h200-nv_2' in runners - - -# Tests for generate_runner_sweep_config -def test_generate_runner_sweep_config(sample_master_config, temp_config_files): - """Test runner sweep config generation.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = "70b" - runner_type = "h200" - precision = None - framework = None - runner_config = runner_file - - result = generate_runner_sweep_config(Args(), sample_master_config) - assert len(result) > 0 - - -def test_generate_runner_sweep_config_with_filters(sample_master_config, temp_config_files): - """Test runner sweep with precision and framework filters.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = "70b" - runner_type = "h200" - precision = "fp8" - framework = "vllm" - runner_config = runner_file - - result = generate_runner_sweep_config(Args(), sample_master_config) - assert all(entry['precision'] == 'fp8' for entry in result) - assert all(entry['framework'] == 'vllm' for entry in result) - - -def test_generate_runner_sweep_config_no_matches(sample_master_config, temp_config_files): - """Test runner sweep with no matching configs.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = "nonexistent" - runner_type = "h200" - precision = None - framework = None - runner_config = runner_file - - with pytest.raises(ValueError, match="No configs found matching"): - generate_runner_sweep_config(Args(), sample_master_config) - - -# Tests for generate_custom_test -def test_generate_custom_test(temp_config_files): - """Test custom test generation.""" - _, runner_file = temp_config_files - - class Args: - runner_label = "h200" - image = "vllm/vllm-openai:latest" - model = "test/model" - framework = "vllm" - precision = "fp8" - exp_name = "custom_test" - runner_config = runner_file - - result = generate_custom_test(Args()) - assert len(result) == 1 - assert result[0]['image'] == 
"vllm/vllm-openai:latest" - assert result[0]['exp-name'] == "custom_test" - - -def test_generate_custom_test_invalid_runner(temp_config_files): - """Test custom test with invalid runner label.""" - _, runner_file = temp_config_files - - class Args: - runner_label = "invalid-runner" - image = "vllm/vllm-openai:latest" - model = "test/model" - framework = "vllm" - precision = "fp8" - exp_name = "custom_test" - runner_config = runner_file - - with pytest.raises(ValueError, match="Unable to find specified runner label"): - generate_custom_test(Args()) - - -# Tests for main function -def test_main_full_sweep(temp_config_files): - """Test main function with full-sweep command.""" - master_file, _ = temp_config_files - - test_args = [ - "generate_sweep_configs.py", - "full-sweep", - "--config-files", master_file, - "--seq-lens", "1k1k", - "--model-prefix", "70b", - "--step-size", "2" - ] - - with patch('sys.argv', test_args): - result = main() - assert len(result) > 0 - - -def test_main_full_sweep_with_filters(temp_config_files): - """Test main function with full-sweep command with filters.""" - master_file, runner_file = temp_config_files - - test_args = [ - "generate_sweep_configs.py", - "full-sweep", - "--config-files", master_file, - "--runner-config", runner_file, - "--model-prefix", "70b", - "--precision", "fp8", - "--test-mode" - ] - - with patch('sys.argv', test_args): - result = main() - assert len(result) > 0 - - -def test_main_test_config(temp_config_files): - """Test main function with test-config command.""" - master_file, runner_file = temp_config_files - - test_args = [ - "generate_sweep_configs.py", - "test-config", - "--config-files", master_file, - "--runner-config", runner_file, - "--key", "70b-fp8-vllm", - "--runner-node", "h200-nv_1", - "--test-mode" - ] - - with patch('sys.argv', test_args): - result = main() - assert len(result) > 0 - - -def test_main_runner_model_sweep(temp_config_files): - """Test main function with runner-model-sweep command.""" - master_file, runner_file = temp_config_files - - test_args = [ - "generate_sweep_configs.py", - "runner-model-sweep", - "--config-files", master_file, - "--runner-config", runner_file, - "--runner-type", "h200" - ] - - with patch('sys.argv', test_args): - result = main() - assert len(result) > 0 - - -def test_main_runner_model_sweep_with_node_filter(temp_config_files): - """Test main function with runner-model-sweep command with node filter.""" - master_file, runner_file = temp_config_files - - test_args = [ - "generate_sweep_configs.py", - "runner-model-sweep", - "--config-files", master_file, - "--runner-config", runner_file, - "--runner-type", "h200", - "--runner-node-filter", "nv_1" - ] - - with patch('sys.argv', test_args): - result = main() - assert len(result) > 0 - runners = set(entry['runner'] for entry in result) - assert 'h200-nv_1' in runners - assert 'h200-nv_2' not in runners - - -def test_main_runner_sweep(temp_config_files): - """Test main function with runner-sweep command.""" - master_file, runner_file = temp_config_files - - test_args = [ - "generate_sweep_configs.py", - "runner-sweep", - "--config-files", master_file, - "--runner-config", runner_file, - "--runner-type", "h200", - "--model-prefix", "70b" - ] - - with patch('sys.argv', test_args): - result = main() - assert len(result) > 0 - - -def test_main_custom(temp_config_files): - """Test main function with custom command.""" - master_file, runner_file = temp_config_files - - test_args = [ - "generate_sweep_configs.py", - "custom", - "--config-files", 
master_file, - "--runner-config", runner_file, - "--runner-label", "h200", - "--image", "test:latest", - "--model", "test/model", - "--framework", "vllm", - "--precision", "fp8", - "--exp-name", "custom_test" - ] - - with patch('sys.argv', test_args): - result = main() - assert len(result) == 1 - - -def test_main_invalid_config_structure(tmp_path): - """Test main with invalid config structure.""" - invalid_file = tmp_path / "invalid.yaml" - with open(invalid_file, 'w') as f: - yaml.dump({"key": {"image": "test"}}, f) # Missing required fields - - test_args = [ - "generate_sweep_configs.py", - "full-sweep", - "--config-files", str(invalid_file), - "--seq-lens", "1k1k", - "--model-prefix", "test" - ] - - with patch('sys.argv', test_args): - with pytest.raises(ValueError): - main() - - -def test_main_validation_failure(temp_config_files, monkeypatch): - """Test main with validation failure on output.""" - master_file, _ = temp_config_files - - # Monkey patch validate_matrix_output to always fail - def mock_validate(entries): - raise ValueError("Validation failed") - - monkeypatch.setattr('generate_sweep_configs.validate_matrix_output', mock_validate) - - test_args = [ - "generate_sweep_configs.py", - "full-sweep", - "--config-files", master_file, - "--seq-lens", "1k1k", - "--model-prefix", "70b" - ] - - with patch('sys.argv', test_args): - with pytest.raises(ValueError, match="Validation failed"): - main() - - -# Edge case tests -def test_concurrency_step_reaches_exact_end(sample_master_config, temp_config_files): - """Test that concurrency stepping reaches exact end value.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["8b"] - seq_lens = ["1k1k"] - step_size = 2 - precision = None - framework = None - runner_type = None - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - # conc-start=4, conc-end=16, step=2 should give 4,8,16 - conc_values = sorted(set(e['conc'] for e in result)) - assert 16 in conc_values - - -def test_multiple_model_prefixes_filtered_sweep(sample_master_config, temp_config_files): - """Test filtered sweep with multiple model prefixes.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["70b", "8b"] - precision = None - framework = None - runner_type = None - seq_lens = ["1k1k"] - step_size = 2 - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - exp_names = [e['exp-name'] for e in result] - assert any('70b' in name for name in exp_names) - assert any('8b' in name for name in exp_names) - - -def test_seq_len_filter_multiple(sample_master_config, temp_config_files): - """Test filtering with multiple sequence lengths.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["70b"] - precision = None - framework = None - runner_type = None - seq_lens = ["1k1k", "1k8k"] - step_size = 2 - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - seq_lens = set((e['isl'], e['osl']) for e in result) - assert (1024, 1024) in seq_lens - assert (1024, 8192) in seq_lens - - -def test_default_ep_dp_attn_values(sample_master_config, temp_config_files): - """Test that default ep and dp-attn values are set correctly.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["8b"] - seq_lens = ["1k1k"] - step_size = 2 - precision = None - framework = None - runner_type = None - test_mode = False - runner_config = runner_file - - 
result = generate_full_sweep(Args(), sample_master_config)
-    # 8b config doesn't specify ep/dp-attn, so should use defaults
-    assert all(e['ep'] == 1 for e in result)
-    assert all(e['dp-attn'] == False for e in result)
-
-
-def test_max_model_len_calculation(sample_master_config, temp_config_files):
-    """Test that max-model-len is calculated correctly."""
-    _, runner_file = temp_config_files
-
-    class Args:
-        model_prefix = ["70b"]
-        seq_lens = ["1k8k"]
-        step_size = 2
-        precision = None
-        framework = None
-        runner_type = None
-        test_mode = False
-        runner_config = runner_file
-
-    result = generate_full_sweep(Args(), sample_master_config)
-    # isl=1024, osl=8192, so max-model-len should be 1024+8192+200=9416
-    assert all(e['max-model-len'] == 9416 for e in result)
-
-
-if __name__ == "__main__":
-    pytest.main([__file__, "-v", "--cov=generate_sweep_configs", "--cov-report=term-missing"])
diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py
new file mode 100644
index 000000000..cba89c448
--- /dev/null
+++ b/utils/matrix_logic/generate_sweep_configs.py
@@ -0,0 +1,748 @@
+import json
+import argparse
+import sys
+from pathlib import Path
+
+# Ensure sibling modules are importable regardless of how the script is invoked
+sys.path.insert(0, str(Path(__file__).resolve().parent))
+
+from validation import (
+    validate_matrix_entry,
+    load_config_files,
+    load_runner_file,
+    Fields
+)
+
+seq_len_stoi = {
+    "1k1k": (1024, 1024),
+    "1k8k": (1024, 8192),
+    "8k1k": (8192, 1024)
+}
+
+# Reverse mapping for exp-name generation
+seq_len_itos = {v: k for k, v in seq_len_stoi.items()}
+
+
+def seq_len_to_str(isl: int, osl: int) -> str:
+    """Convert sequence lengths to short string representation.
+
+    Returns the short name (e.g., '1k1k') if it exists in the mapping,
+    otherwise returns 'isl_osl' format.
+    """
+    return seq_len_itos.get((isl, osl), f"{isl}_{osl}")
+
+def mark_eval_entries(matrix_values: list[dict]) -> list[dict]:
+    """Mark entries that should run evaluation.
+
+    For each unique (model, runner, framework, precision, isl, osl) combination:
+    - Mark highest TP with highest conc
+    - Mark lowest TP with highest conc
+    """
+    from collections import defaultdict
+
+    # Group entries by (model, runner, framework, precision, isl, osl)
+    # This ensures we compare within the same configuration, not across different frameworks
+    groups = defaultdict(list)
+    for i, entry in enumerate(matrix_values):
+        key = (
+            entry[Fields.MODEL.value],
+            entry[Fields.RUNNER.value],
+            entry[Fields.FRAMEWORK.value],
+            entry[Fields.PRECISION.value],
+            entry[Fields.ISL.value],
+            entry[Fields.OSL.value]
+        )
+        groups[key].append((i, entry))
+
+    # For each group, find highest TP/highest conc and lowest TP/highest conc
+    eval_indices = set()
+    for key, entries in groups.items():
+        if not entries:
+            continue
+
+        # Find min and max TP values
+        min_tp = min(e[Fields.TP.value] for _, e in entries)
+        max_tp = max(e[Fields.TP.value] for _, e in entries)
+
+        # Find highest conc for highest TP
+        highest_tp_entries = [(i, e) for i, e in entries if e[Fields.TP.value] == max_tp]
+        if highest_tp_entries:
+            max_conc_highest_tp = max(e[Fields.CONC.value] for _, e in highest_tp_entries)
+            for i, e in highest_tp_entries:
+                if e[Fields.CONC.value] == max_conc_highest_tp:
+                    eval_indices.add(i)
+
+        # Find highest conc for lowest TP (only if different from max_tp)
+        if min_tp != max_tp:
+            lowest_tp_entries = [(i, e) for i, e in entries if e[Fields.TP.value] == min_tp]
+            if lowest_tp_entries:
+                max_conc_lowest_tp = max(e[Fields.CONC.value] for _, e in lowest_tp_entries)
+                for i, e in lowest_tp_entries:
+                    if e[Fields.CONC.value] == max_conc_lowest_tp:
+                        eval_indices.add(i)
+
+    # Mark the selected entries
+    for i, entry in enumerate(matrix_values):
+        entry[Fields.FIELD_RUN_EVAL.value] = i in eval_indices
+
+    return matrix_values
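To make the marking rule concrete, here is a small illustrative sketch; the literal key strings and the entries themselves are assumptions for illustration only (the real keys come from the Fields enum in validation.py):

# Hypothetical group: one (model, runner, framework, precision, isl, osl)
# combination with two TP values and two concurrencies each.
entries = [
    {"model": "m", "runner": "h200", "framework": "vllm", "precision": "fp8",
     "isl": 1024, "osl": 1024, "tp": tp, "conc": conc}
    for tp in (2, 4) for conc in (8, 16)
]
marked = mark_eval_entries(entries)
# Expected marking: (tp=4, conc=16) for the highest TP and (tp=2, conc=16)
# for the lowest TP are flagged to run evals; (tp=2, conc=8) and (tp=4, conc=8)
# are not. The flag is stored under Fields.FIELD_RUN_EVAL.value, whose exact
# string is defined in validation.py.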
+
+
+def generate_full_sweep(args, all_config_data, runner_data):
+    """Generate full sweep configurations with optional filtering.
+
+    Supports filtering by model prefix, precision, framework, runner type, sequence lengths,
+    and max concurrency.
+
+    All filters are optional - can generate sweeps for all configs or filter by specific criteria.
+
+    Assumes all_config_data has been validated by validate_master_config().
+    """
+    # Validate runner types if specified
+    if args.runner_type:
+        valid_runner_types = set(runner_data.keys())
+        invalid_runners = set(args.runner_type) - valid_runner_types
+        if invalid_runners:
+            raise ValueError(
+                f"Invalid runner type(s): {invalid_runners}. 
" + f"Valid runner types are: {', '.join(sorted(valid_runner_types))}") + + matrix_values = [] + + # Convert seq-lens to set of (isl, osl) tuples for filtering + seq_lens_filter = None + if args.seq_lens: + seq_lens_filter = {seq_len_stoi[sl] for sl in args.seq_lens} + + # Iterate through all configurations and apply filters as specified (this is just "selecting" + # configs from all of the master configs subject to some pattern matching) + for key, val in all_config_data.items(): + # Filter by model prefix if specified + if args.model_prefix: + if not any(key.startswith(prefix) for prefix in args.model_prefix): + continue + + # Filter by precision if specified + if args.precision and val[Fields.PRECISION.value] not in args.precision: + continue + + # Filter by framework if specified + if args.framework and val[Fields.FRAMEWORK.value] not in args.framework: + continue + + # Filter by runner type if specified + if args.runner_type and val[Fields.RUNNER.value] not in args.runner_type: + continue + + # Check if this is a multinode config + is_multinode = val.get(Fields.MULTINODE.value, False) + # Get disagg value, defaulting to False if not specified + disagg = val.get(Fields.DISAGG.value, False) + + seq_len_configs = val[Fields.SEQ_LEN_CONFIGS.value] + image = val[Fields.IMAGE.value] + model = val[Fields.MODEL.value] + precision = val[Fields.PRECISION.value] + framework = val[Fields.FRAMEWORK.value] + runner = val[Fields.RUNNER.value] + model_code = val[Fields.MODEL_PREFIX.value] + + for seq_config in seq_len_configs: + isl = seq_config[Fields.ISL.value] + osl = seq_config[Fields.OSL.value] + + # Filter by sequence lengths if specified + if seq_lens_filter and (isl, osl) not in seq_lens_filter: + continue + + bmk_space = seq_config[Fields.SEARCH_SPACE.value] + + for bmk in bmk_space: + if is_multinode: + # Skip multinode configs when --single-node is specified + if not args.multi_node: + continue + + # Multinode configuration + # spec_decoding defaults to "none" if not specified + spec_decoding = bmk.get(Fields.SPEC_DECODING.value, "none") + + prefill = bmk[Fields.PREFILL.value] + decode = bmk[Fields.DECODE.value] + + # Get concurrency values (can be list or range) + conc_list = bmk.get(Fields.CONC_LIST.value) + # If it's a list + if conc_list: + conc_values = conc_list + # If it's a range + else: + conc_start = bmk[Fields.CONC_START.value] + conc_end = bmk[Fields.CONC_END.value] + conc_values = [] + conc = conc_start + while conc <= conc_end: + conc_values.append(conc) + if conc == conc_end: + break + conc *= args.step_size + if conc > conc_end: + conc = conc_end + + # Apply max-conc filter if specified + # If max_conc is less than all values, use max_conc directly (if valid) + if args.max_conc is not None: + filtered_conc = [c for c in conc_values if c <= args.max_conc] + if not filtered_conc: + # No existing values <= max_conc, so use max_conc directly if valid + if args.max_conc > 0: + conc_values = [args.max_conc] + else: + continue # Skip if max_conc is not positive + else: + conc_values = filtered_conc + + # For multinode, create a single entry with conc as a list + seq_len_str = seq_len_to_str(isl, osl) + entry = { + Fields.IMAGE.value: image, + Fields.MODEL.value: model, + Fields.MODEL_PREFIX.value: model_code, + Fields.PRECISION.value: precision, + Fields.FRAMEWORK.value: framework, + Fields.RUNNER.value: runner, + Fields.ISL.value: isl, + Fields.OSL.value: osl, + Fields.SPEC_DECODING.value: spec_decoding, + Fields.PREFILL.value: prefill, + Fields.DECODE.value: decode, + 
Fields.CONC.value: conc_values, # Pass the entire list for multinode + Fields.MAX_MODEL_LEN.value: isl + osl + 200, + Fields.EXP_NAME.value: f"{model_code}_{seq_len_str}", + Fields.DISAGG.value: disagg, + } + + validate_matrix_entry(entry, is_multinode) + matrix_values.append(entry) + elif args.single_node: + # Single-node configuration + tp = bmk[Fields.TP.value] + conc_start = bmk[Fields.CONC_START.value] + conc_end = bmk[Fields.CONC_END.value] + ep = bmk.get(Fields.EP.value) + dp_attn = bmk.get(Fields.DP_ATTN.value) + spec_decoding = bmk.get(Fields.SPEC_DECODING.value, "none") + + # Apply max-tp filter if specified + # If tp > max_tp, use max_tp instead of skipping (if valid) + if args.max_tp is not None: + if args.max_tp <= 0: + continue # Skip if max_tp is not positive + if tp > args.max_tp: + tp = args.max_tp + + # Apply max-ep filter if specified + # If ep > max_ep, use max_ep instead of skipping (if valid) + if args.max_ep is not None: + if args.max_ep <= 0: + continue # Skip if max_ep is not positive + if ep is not None and ep > args.max_ep: + ep = args.max_ep + + # Apply max-conc filter if specified + # If conc_start > max_conc, use max_conc as both start and end (if valid) + if args.max_conc is not None: + if args.max_conc <= 0: + continue # Skip if max_conc is not positive + if conc_start > args.max_conc: + conc_start = args.max_conc + conc_end = args.max_conc + else: + conc_end = min(conc_end, args.max_conc) + + conc = conc_start + while conc <= conc_end: + seq_len_str = seq_len_to_str(isl, osl) + entry = { + Fields.IMAGE.value: image, + Fields.MODEL.value: model, + Fields.MODEL_PREFIX.value: model_code, + Fields.PRECISION.value: precision, + Fields.FRAMEWORK.value: framework, + Fields.RUNNER.value: runner, + Fields.ISL.value: isl, + Fields.OSL.value: osl, + Fields.TP.value: tp, + Fields.CONC.value: conc, + Fields.MAX_MODEL_LEN.value: isl + osl + 200, + Fields.EP.value: 1, # Default + Fields.DP_ATTN.value: False, # Default + Fields.SPEC_DECODING.value: spec_decoding, + Fields.EXP_NAME.value: f"{model_code}_{seq_len_str}", + Fields.DISAGG.value: disagg, + } + + if ep is not None: + entry[Fields.EP.value] = ep + if dp_attn is not None: + entry[Fields.DP_ATTN.value] = dp_attn + + validate_matrix_entry(entry, is_multinode) + matrix_values.append(entry) + + if conc == conc_end: + break + conc *= args.step_size + if conc > conc_end: + conc = conc_end + + return matrix_values + + +def generate_runner_model_sweep_config(args, all_config_data, runner_data): + """Generate runner-model sweep configurations. + + Assumes all_config_data has been validated by validate_config_structure(). + Supports both single-node and multinode configurations. + """ + runner_nodes = runner_data.get(args.runner_type) + + if not runner_nodes: + raise ValueError( + f"Runner '{args.runner_type}' does not exist in runner config '{args.runner_config}'. 
Must choose from existing runner types: '{', '.join(runner_data.keys())}'.") + + # Filter runner nodes if filter is specified + if args.runner_node_filter: + runner_nodes = [ + node for node in runner_nodes if args.runner_node_filter in node] + if not runner_nodes: + raise ValueError( + f"No runner nodes found matching filter '{args.runner_node_filter}' for runner type '{args.runner_type}'.") + + matrix_values = [] + for key, val in all_config_data.items(): + # Only consider configs with specified runner + if val[Fields.RUNNER.value] != args.runner_type: + continue + + is_multinode = val.get(Fields.MULTINODE.value, False) + + # Skip configs that don't match the requested node type + if args.single_node and is_multinode: + continue + if args.multi_node and not is_multinode: + continue + + # Get model code for exp_name + model_code = val[Fields.MODEL_PREFIX.value] + # Get disagg value, defaulting to False if not specified + disagg = val.get(Fields.DISAGG.value, False) + + # Find 1k1k config + target_config = None + for config in val[Fields.SEQ_LEN_CONFIGS.value]: + if config[Fields.ISL.value] == 1024 and config[Fields.OSL.value] == 1024: + target_config = config + break + + if target_config is None: + continue + + if is_multinode: + # For multinode, find the search space entry with the lowest concurrency + def get_lowest_conc(search_space_entry): + conc_list = search_space_entry.get(Fields.CONC_LIST.value, []) + return min(conc_list) if conc_list else float('inf') + + lowest_conc_entry = min( + target_config[Fields.SEARCH_SPACE.value], key=get_lowest_conc) + + conc_list = lowest_conc_entry.get(Fields.CONC_LIST.value, []) + lowest_conc = min(conc_list) if conc_list else 1 + + spec_decoding = lowest_conc_entry.get( + Fields.SPEC_DECODING.value, "none") + prefill_config = lowest_conc_entry[Fields.PREFILL.value] + decode_config = lowest_conc_entry[Fields.DECODE.value] + + for node in runner_nodes: + entry = { + Fields.IMAGE.value: val[Fields.IMAGE.value], + Fields.MODEL.value: val[Fields.MODEL.value], + Fields.MODEL_PREFIX.value: model_code, + Fields.PRECISION.value: val[Fields.PRECISION.value], + Fields.FRAMEWORK.value: val[Fields.FRAMEWORK.value], + Fields.RUNNER.value: node, + Fields.ISL.value: 1024, + Fields.OSL.value: 1024, + Fields.SPEC_DECODING.value: spec_decoding, + Fields.PREFILL.value: { + Fields.NUM_WORKER.value: prefill_config[Fields.NUM_WORKER.value], + Fields.TP.value: prefill_config[Fields.TP.value], + Fields.EP.value: prefill_config[Fields.EP.value], + Fields.DP_ATTN.value: prefill_config[Fields.DP_ATTN.value], + Fields.ADDITIONAL_SETTINGS.value: prefill_config.get(Fields.ADDITIONAL_SETTINGS.value, []), + }, + Fields.DECODE.value: { + Fields.NUM_WORKER.value: decode_config[Fields.NUM_WORKER.value], + Fields.TP.value: decode_config[Fields.TP.value], + Fields.EP.value: decode_config[Fields.EP.value], + Fields.DP_ATTN.value: decode_config[Fields.DP_ATTN.value], + Fields.ADDITIONAL_SETTINGS.value: decode_config.get(Fields.ADDITIONAL_SETTINGS.value, []), + }, + Fields.CONC.value: [lowest_conc], + Fields.MAX_MODEL_LEN.value: 2048, + Fields.EXP_NAME.value: f"{model_code}_test", + Fields.DISAGG.value: disagg, + } + matrix_values.append(validate_matrix_entry(entry, is_multinode=True)) + else: + # Single-node: pick highest TP config with lowest concurrency + highest_tp_bmk = max( + target_config[Fields.SEARCH_SPACE.value], key=lambda x: x[Fields.TP.value]) + highest_tp = highest_tp_bmk[Fields.TP.value] + lowest_conc = highest_tp_bmk[Fields.CONC_START.value] + + ep = 
highest_tp_bmk.get(Fields.EP.value) + dp_attn = highest_tp_bmk.get(Fields.DP_ATTN.value) + spec_decoding = highest_tp_bmk.get(Fields.SPEC_DECODING.value, "none") + + for node in runner_nodes: + entry = { + Fields.IMAGE.value: val[Fields.IMAGE.value], + Fields.MODEL.value: val[Fields.MODEL.value], + Fields.MODEL_PREFIX.value: model_code, + Fields.PRECISION.value: val[Fields.PRECISION.value], + Fields.FRAMEWORK.value: val[Fields.FRAMEWORK.value], + Fields.RUNNER.value: node, + Fields.ISL.value: 1024, + Fields.OSL.value: 1024, + Fields.TP.value: highest_tp, + Fields.EP.value: ep if ep is not None else 1, + Fields.DP_ATTN.value: dp_attn if dp_attn is not None else False, + Fields.SPEC_DECODING.value: spec_decoding, + Fields.CONC.value: lowest_conc, + Fields.MAX_MODEL_LEN.value: 2048, + Fields.EXP_NAME.value: f"{model_code}_test", + Fields.DISAGG.value: disagg, + } + matrix_values.append(validate_matrix_entry(entry, is_multinode=False)) + + return matrix_values + + +def generate_test_config_sweep(args, all_config_data): + """Generate full sweep for specific config keys. + + Validates that all specified config keys exist before generating. + Expands all configs fully without any filtering. + """ + # Validate all config keys exist + missing_keys = [key for key in args.config_keys if key not in all_config_data] + if missing_keys: + available_keys = sorted(all_config_data.keys()) + raise ValueError( + f"Config key(s) not found: {', '.join(missing_keys)}.\n" + f"Available keys: {', '.join(available_keys)}" + ) + + matrix_values = [] + + for key in args.config_keys: + val = all_config_data[key] + is_multinode = val.get(Fields.MULTINODE.value, False) + + image = val[Fields.IMAGE.value] + model = val[Fields.MODEL.value] + model_code = val[Fields.MODEL_PREFIX.value] + precision = val[Fields.PRECISION.value] + framework = val[Fields.FRAMEWORK.value] + runner = val[Fields.RUNNER.value] + disagg = val.get(Fields.DISAGG.value, False) + + for seq_len_config in val[Fields.SEQ_LEN_CONFIGS.value]: + isl = seq_len_config[Fields.ISL.value] + osl = seq_len_config[Fields.OSL.value] + seq_len_str = seq_len_to_str(isl, osl) + + for bmk in seq_len_config[Fields.SEARCH_SPACE.value]: + if is_multinode: + # Multinode config + spec_decoding = bmk.get(Fields.SPEC_DECODING.value, "none") + prefill = bmk[Fields.PREFILL.value] + decode = bmk[Fields.DECODE.value] + + # Get concurrency values + if Fields.CONC_LIST.value in bmk: + conc_values = bmk[Fields.CONC_LIST.value] + else: + conc_start = bmk[Fields.CONC_START.value] + conc_end = bmk[Fields.CONC_END.value] + conc_values = [] + conc = conc_start + while conc <= conc_end: + conc_values.append(conc) + if conc == conc_end: + break + conc *= 2 + if conc > conc_end: + conc = conc_end + + entry = { + Fields.IMAGE.value: image, + Fields.MODEL.value: model, + Fields.MODEL_PREFIX.value: model_code, + Fields.PRECISION.value: precision, + Fields.FRAMEWORK.value: framework, + Fields.RUNNER.value: runner, + Fields.ISL.value: isl, + Fields.OSL.value: osl, + Fields.SPEC_DECODING.value: spec_decoding, + Fields.PREFILL.value: prefill, + Fields.DECODE.value: decode, + Fields.CONC.value: conc_values, + Fields.MAX_MODEL_LEN.value: isl + osl + 200, + Fields.EXP_NAME.value: f"{model_code}_{seq_len_str}", + Fields.DISAGG.value: disagg, + } + matrix_values.append(validate_matrix_entry(entry, is_multinode=True)) + else: + # Single-node config + tp = bmk[Fields.TP.value] + ep = bmk.get(Fields.EP.value) + dp_attn = bmk.get(Fields.DP_ATTN.value) + spec_decoding = bmk.get(Fields.SPEC_DECODING.value, 
"none") + + # Get concurrency values + if Fields.CONC_LIST.value in bmk: + conc_values = bmk[Fields.CONC_LIST.value] + else: + conc_start = bmk[Fields.CONC_START.value] + conc_end = bmk[Fields.CONC_END.value] + conc_values = [] + conc = conc_start + while conc <= conc_end: + conc_values.append(conc) + if conc == conc_end: + break + conc *= 2 + if conc > conc_end: + conc = conc_end + + for conc in conc_values: + entry = { + Fields.IMAGE.value: image, + Fields.MODEL.value: model, + Fields.MODEL_PREFIX.value: model_code, + Fields.PRECISION.value: precision, + Fields.FRAMEWORK.value: framework, + Fields.RUNNER.value: runner, + Fields.ISL.value: isl, + Fields.OSL.value: osl, + Fields.TP.value: tp, + Fields.CONC.value: conc, + Fields.MAX_MODEL_LEN.value: isl + osl + 200, + Fields.EP.value: ep if ep is not None else 1, + Fields.DP_ATTN.value: dp_attn if dp_attn is not None else False, + Fields.SPEC_DECODING.value: spec_decoding, + Fields.EXP_NAME.value: f"{model_code}_{seq_len_str}", + Fields.DISAGG.value: disagg, + } + matrix_values.append(validate_matrix_entry(entry, is_multinode=False)) + + return matrix_values + + +def main(): + # Create parent parser with common arguments + parent_parser = argparse.ArgumentParser(add_help=False) + parent_parser.add_argument( + '--config-files', + nargs='+', + required=True, + help='One or more configuration files (YAML format)' + ) + parent_parser.add_argument( + '--runner-config', + required=True, + help='Configuration file holding runner information (YAML format)' + ) + parent_parser.add_argument( + '--run-evals', + action='store_true', + required=False, + help='When specifiedm run evals on a subset of configs.' + ) + + # Create main parser + parser = argparse.ArgumentParser( + description='Generate benchmark configurations from YAML config files' + ) + + # Create subparsers for subcommands + subparsers = parser.add_subparsers( + dest='command', + required=True, + help='Available commands' + ) + + # Subcommand: full-sweep + full_sweep_parser = subparsers.add_parser( + 'full-sweep', + parents=[parent_parser], + add_help=False, + help='Generate full sweep configurations with optional filtering by model, precision, framework, runner type, and sequence lengths' + ) + full_sweep_parser.add_argument( + '--model-prefix', + nargs='+', + required=False, + help='Model prefix(es) to filter configurations (optional, can specify multiple)' + ) + full_sweep_parser.add_argument( + '--precision', + nargs='+', + required=False, + help='Precision(s) to filter by (e.g., fp4, fp8) (optional, can specify multiple)' + ) + full_sweep_parser.add_argument( + '--framework', + nargs='+', + required=False, + help='Framework(s) to filter by (e.g., vllm, trt, sglang) (optional, can specify multiple)' + ) + full_sweep_parser.add_argument( + '--runner-type', + nargs='+', + required=False, + help='Runner type(s) to filter by (e.g., h200, h100) (optional, can specify multiple)' + ) + full_sweep_parser.add_argument( + '--seq-lens', + nargs='+', + choices=list(seq_len_stoi.keys()), + required=False, + help=f"Sequence length configurations to include: {', '.join(seq_len_stoi.keys())}. If not specified, all sequence lengths are included." 
+
+
+def main():
+    # Create parent parser with common arguments
+    parent_parser = argparse.ArgumentParser(add_help=False)
+    parent_parser.add_argument(
+        '--config-files',
+        nargs='+',
+        required=True,
+        help='One or more configuration files (YAML format)'
+    )
+    parent_parser.add_argument(
+        '--runner-config',
+        required=True,
+        help='Configuration file holding runner information (YAML format)'
+    )
+    parent_parser.add_argument(
+        '--run-evals',
+        action='store_true',
+        required=False,
+        help='When specified, run evals on a subset of configs.'
+    )
+
+    # Create main parser
+    parser = argparse.ArgumentParser(
+        description='Generate benchmark configurations from YAML config files'
+    )
+
+    # Create subparsers for subcommands
+    subparsers = parser.add_subparsers(
+        dest='command',
+        required=True,
+        help='Available commands'
+    )
+
+    # Subcommand: full-sweep
+    full_sweep_parser = subparsers.add_parser(
+        'full-sweep',
+        parents=[parent_parser],
+        add_help=False,
+        help='Generate full sweep configurations with optional filtering by model, precision, framework, runner type, and sequence lengths'
+    )
+    full_sweep_parser.add_argument(
+        '--model-prefix',
+        nargs='+',
+        required=False,
+        help='Model prefix(es) to filter configurations (optional, can specify multiple)'
+    )
+    full_sweep_parser.add_argument(
+        '--precision',
+        nargs='+',
+        required=False,
+        help='Precision(s) to filter by (e.g., fp4, fp8) (optional, can specify multiple)'
+    )
+    full_sweep_parser.add_argument(
+        '--framework',
+        nargs='+',
+        required=False,
+        help='Framework(s) to filter by (e.g., vllm, trt, sglang) (optional, can specify multiple)'
+    )
+    full_sweep_parser.add_argument(
+        '--runner-type',
+        nargs='+',
+        required=False,
+        help='Runner type(s) to filter by (e.g., h200, h100) (optional, can specify multiple)'
+    )
+    full_sweep_parser.add_argument(
+        '--seq-lens',
+        nargs='+',
+        choices=list(seq_len_stoi.keys()),
+        required=False,
+        help=f"Sequence length configurations to include: {', '.join(seq_len_stoi.keys())}. If not specified, all sequence lengths are included."
+    )
+    full_sweep_parser.add_argument(
+        '--step-size',
+        type=int,
+        default=2,
+        help='Step size for concurrency values (default: 2)'
+    )
+    full_sweep_parser.add_argument(
+        '--max-conc',
+        type=int,
+        required=False,
+        help='Maximum concurrency value to include (filters out higher concurrency values)'
+    )
+    full_sweep_parser.add_argument(
+        '--max-tp',
+        type=int,
+        required=False,
+        help='Maximum tensor parallelism value to include (single-node only)'
+    )
+    full_sweep_parser.add_argument(
+        '--max-ep',
+        type=int,
+        required=False,
+        help='Maximum expert parallelism value to include (single-node only)'
+    )
+    node_type_group = full_sweep_parser.add_mutually_exclusive_group(required=True)
+    node_type_group.add_argument(
+        '--single-node',
+        action='store_true',
+        help='Only generate single-node configurations'
+    )
+    node_type_group.add_argument(
+        '--multi-node',
+        action='store_true',
+        help='Only generate multi-node configurations'
+    )
+    full_sweep_parser.add_argument(
+        '-h', '--help',
+        action='help',
+        help='Show this help message and exit'
+    )
+
+    # Subcommand: runner-model-sweep
+    test_config_parser = subparsers.add_parser(
+        'runner-model-sweep',
+        parents=[parent_parser],
+        add_help=False,
+        help='Given a runner type, find all configurations matching that type and run each of them on every individual runner node of that type. This validates that all runner nodes work with all configurations for the runner type (for instance, that every config targeting an h200 runner runs successfully on every h200 node).'
+    )
+    test_config_parser.add_argument(
+        '--runner-type',
+        required=True,
+        help='Runner type (e.g., b200-trt, h100)'
+    )
+    test_config_parser.add_argument(
+        '--runner-node-filter',
+        required=False,
+        help='Filter runner nodes by substring match (e.g., "mi300x-amd" to only include nodes containing that string)'
+    )
+    test_node_group = test_config_parser.add_mutually_exclusive_group(
+        required=True)
+    test_node_group.add_argument(
+        '--single-node',
+        action='store_true',
+        help='Generate single-node configurations only'
+    )
+    test_node_group.add_argument(
+        '--multi-node',
+        action='store_true',
+        help='Generate multi-node configurations only'
+    )
+    test_config_parser.add_argument(
+        '-h', '--help',
+        action='help',
+        help='Show this help message and exit'
+    )
+
+    # Subcommand: test-config
+    test_config_keys_parser = subparsers.add_parser(
+        'test-config',
+        parents=[parent_parser],
+        add_help=False,
+        help='Generate full sweep for specific config keys. Validates that all specified keys exist before generating.'
+ ) + test_config_keys_parser.add_argument( + '--config-keys', + nargs='+', + required=True, + help='One or more config keys to generate sweep for (e.g., dsr1-fp4-b200-sglang dsr1-fp8-h200-trt)' + ) + test_config_keys_parser.add_argument( + '-h', '--help', + action='help', + help='Show this help message and exit' + ) + + args = parser.parse_args() + + # Load and validate configuration files (validation happens by default in load functions) + all_config_data = load_config_files(args.config_files) + runner_data = load_runner_file(args.runner_config) + + # Route to appropriate function based on subcommand + if args.command == 'full-sweep': + matrix_values = generate_full_sweep(args, all_config_data, runner_data) + elif args.command == 'runner-model-sweep': + matrix_values = generate_runner_model_sweep_config( + args, all_config_data, runner_data) + elif args.command == 'test-config': + matrix_values = generate_test_config_sweep(args, all_config_data) + else: + parser.error(f"Unknown command: {args.command}") + + # Choose eval (opt-in via --run-evals) + if args.run_evals: + matrix_values = mark_eval_entries(matrix_values) + + print(json.dumps(matrix_values)) + return matrix_values + + +if __name__ == "__main__": + main() diff --git a/utils/matrix-logic/pytest.ini b/utils/matrix_logic/pytest.ini similarity index 100% rename from utils/matrix-logic/pytest.ini rename to utils/matrix_logic/pytest.ini diff --git a/utils/matrix_logic/test_generate_sweep_configs.py b/utils/matrix_logic/test_generate_sweep_configs.py new file mode 100644 index 000000000..c505611c3 --- /dev/null +++ b/utils/matrix_logic/test_generate_sweep_configs.py @@ -0,0 +1,948 @@ +"""Comprehensive tests for generate_sweep_configs.py""" +import pytest +import argparse +from generate_sweep_configs import ( + seq_len_stoi, + seq_len_itos, + seq_len_to_str, + generate_full_sweep, + generate_runner_model_sweep_config, +) + + +# ============================================================================= +# Test Fixtures +# ============================================================================= + +@pytest.fixture +def sample_single_node_config(): + """Single node config based on dsr1-fp8-mi300x-sglang.""" + return { + "dsr1-fp8-mi300x-sglang": { + "image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915", + "model": "deepseek-ai/DeepSeek-R1-0528", + "model-prefix": "dsr1", + "precision": "fp8", + "framework": "sglang", + "runner": "mi300x", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 8192, + "osl": 1024, + "search-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + } + ] + } + } + + +@pytest.fixture +def sample_multinode_config(): + """Multinode config based on dsr1-fp4-gb200-dynamo-trt.""" + return { + "dsr1-fp4-gb200-dynamo-trt": { + "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", + "model": "deepseek-r1-fp4", + "model-prefix": "dsr1", + "precision": "fp4", + "framework": "dynamo-trt", + "runner": "gb200", + "multinode": True, + "disagg": True, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + { + "conc-list": [2150], + "prefill": { + "num-worker": 5, + "tp": 4, + "ep": 4, + "dp-attn": True, + "additional-settings": [ + "PREFILL_MAX_NUM_TOKENS=8448", + "PREFILL_MAX_BATCH_SIZE=1", + ], + }, + "decode": { + "num-worker": 1, + "tp": 8, + "ep": 8, + "dp-attn": True, + "additional-settings": [ + "DECODE_MAX_NUM_TOKENS=256", + 
"DECODE_MAX_BATCH_SIZE=256", + ], + }, + } + ] + } + ] + } + } + + +@pytest.fixture +def sample_runner_config(): + """Runner config based on .github/configs/runners.yaml.""" + return { + "h100": ["h100-cr_0", "h100-cr_1", "h100-cw_0", "h100-cw_1"], + "h200": ["h200-cw_0", "h200-cw_1", "h200-nb_0", "h200-nb_1"], + "b200": ["b200-nvd_0", "b200-nvd_1", "b200-dgxc_1"], + "mi300x": ["mi300x-amd_0", "mi300x-amd_1", "mi300x-cr_0"], + "gb200": ["gb200-nv_0"], + } + + +@pytest.fixture +def full_sweep_args_single_node(): + """Args for full-sweep single-node command.""" + args = argparse.Namespace() + args.model_prefix = None + args.precision = None + args.framework = None + args.runner_type = None + args.seq_lens = None + args.step_size = 2 + args.max_conc = None + args.max_tp = None + args.max_ep = None + args.single_node = True + args.multi_node = False + return args + + +@pytest.fixture +def full_sweep_args_multi_node(): + """Args for full-sweep multi-node command.""" + args = argparse.Namespace() + args.model_prefix = None + args.precision = None + args.framework = None + args.runner_type = None + args.seq_lens = None + args.step_size = 2 + args.max_conc = None + args.max_tp = None + args.max_ep = None + args.single_node = False + args.multi_node = True + return args + + +# ============================================================================= +# Test seq_len mappings +# ============================================================================= + +class TestSeqLenMappings: + """Tests for sequence length string mappings.""" + + def test_seq_len_stoi_values(self): + """Verify seq_len_stoi has expected mappings.""" + assert seq_len_stoi["1k1k"] == (1024, 1024) + assert seq_len_stoi["1k8k"] == (1024, 8192) + assert seq_len_stoi["8k1k"] == (8192, 1024) + + def test_seq_len_itos_reverse_mapping(self): + """Verify seq_len_itos is reverse of stoi.""" + assert seq_len_itos[(1024, 1024)] == "1k1k" + assert seq_len_itos[(1024, 8192)] == "1k8k" + assert seq_len_itos[(8192, 1024)] == "8k1k" + + +class TestSeqLenToStr: + """Tests for seq_len_to_str function.""" + + def test_known_sequence_lengths(self): + """Known sequence lengths should return short name.""" + assert seq_len_to_str(1024, 1024) == "1k1k" + assert seq_len_to_str(1024, 8192) == "1k8k" + assert seq_len_to_str(8192, 1024) == "8k1k" + + def test_unknown_sequence_lengths(self): + """Unknown sequence lengths should return isl_osl format.""" + assert seq_len_to_str(2048, 2048) == "2048_2048" + assert seq_len_to_str(4096, 1024) == "4096_1024" + + +# ============================================================================= +# Test generate_full_sweep for single-node +# ============================================================================= + +class TestGenerateFullSweepSingleNode: + """Tests for generate_full_sweep with single-node configs.""" + + def test_basic_sweep_generation(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """Basic single-node sweep should generate entries.""" + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) > 0 + # With step_size=2, conc goes 4, 8, 16, 32, 64 = 5 values per seq-len config + # 2 seq-len configs * 5 = 10 entries + assert len(result) == 10 + + def test_matrix_entry_structure(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """Generated entries should have correct structure.""" + result = generate_full_sweep( + 
full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + entry = result[0] + assert entry["image"] == "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915" + assert entry["model"] == "deepseek-ai/DeepSeek-R1-0528" + assert entry["precision"] == "fp8" + assert entry["framework"] == "sglang" + assert entry["runner"] == "mi300x" + assert entry["tp"] == 8 + assert "exp-name" in entry + assert "max-model-len" in entry + + def test_filter_by_model_prefix(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """Filter by model prefix should work.""" + full_sweep_args_single_node.model_prefix = ["dsr1"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) > 0 + + # Non-matching prefix should return empty + full_sweep_args_single_node.model_prefix = ["nonexistent"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) == 0 + + def test_filter_by_precision(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """Filter by precision should work.""" + full_sweep_args_single_node.precision = ["fp8"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) > 0 + + full_sweep_args_single_node.precision = ["fp4"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) == 0 + + def test_filter_by_framework(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """Filter by framework should work.""" + full_sweep_args_single_node.framework = ["sglang"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) > 0 + + full_sweep_args_single_node.framework = ["vllm"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) == 0 + + def test_filter_by_runner_type(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """Filter by runner type should work.""" + full_sweep_args_single_node.runner_type = ["mi300x"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) > 0 + + full_sweep_args_single_node.runner_type = ["h100"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) == 0 + + def test_invalid_runner_type_raises_error(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """Invalid runner type should raise ValueError.""" + full_sweep_args_single_node.runner_type = ["invalid_runner"] + with pytest.raises(ValueError) as exc_info: + generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert "Invalid runner type" in str(exc_info.value) + + def test_filter_by_seq_lens(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """Filter by sequence lengths should work.""" + full_sweep_args_single_node.seq_lens = ["1k1k"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + # Only 1k1k 
entries, 5 concurrency values + assert len(result) == 5 + assert all(entry["isl"] == 1024 and entry["osl"] == 1024 for entry in result) + + def test_max_conc_filter(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """max_conc filter should limit concurrency values.""" + full_sweep_args_single_node.max_conc = 16 + full_sweep_args_single_node.seq_lens = ["1k1k"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + # conc values: 4, 8, 16 (32, 64 filtered out) + assert len(result) == 3 + assert all(entry["conc"] <= 16 for entry in result) + + def test_max_conc_creates_config_when_below_min(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """max_conc below config's min should create config with max_conc value.""" + # Config has conc-start=4, so max_conc=1 should create entry with conc=1 + full_sweep_args_single_node.max_conc = 1 + full_sweep_args_single_node.seq_lens = ["1k1k"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + # Should create 1 entry with conc=1 + assert len(result) == 1 + assert result[0]["conc"] == 1 + + def test_max_conc_zero_or_negative_skips(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """max_conc of 0 or negative should skip configs.""" + for invalid_value in [0, -1, -100]: + full_sweep_args_single_node.max_conc = invalid_value + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) == 0, f"Expected 0 results for max_conc={invalid_value}" + + def test_max_tp_filter(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """max_tp filter should use max_tp when config tp exceeds it.""" + full_sweep_args_single_node.max_tp = 4 + full_sweep_args_single_node.seq_lens = ["1k1k"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + # tp=8 in config, but max_tp=4, so should use tp=4 + assert len(result) > 0 + assert all(entry["tp"] == 4 for entry in result) + + def test_max_tp_creates_config_when_below_min(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """max_tp below config's tp should create config with max_tp value.""" + # Config has tp=8, so max_tp=2 should create entries with tp=2 + full_sweep_args_single_node.max_tp = 2 + full_sweep_args_single_node.seq_lens = ["1k1k"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) > 0 + assert all(entry["tp"] == 2 for entry in result) + + def test_max_tp_zero_or_negative_skips(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """max_tp of 0 or negative should skip configs.""" + for invalid_value in [0, -1, -100]: + full_sweep_args_single_node.max_tp = invalid_value + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) == 0, f"Expected 0 results for max_tp={invalid_value}" + + def test_step_size(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """Different step sizes should affect concurrency progression.""" + full_sweep_args_single_node.step_size = 4 + full_sweep_args_single_node.seq_lens = ["1k1k"] + result = 
generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + # conc: 4, 16, 64 = 3 values + assert len(result) == 3 + conc_values = [entry["conc"] for entry in result] + assert 4 in conc_values + assert 16 in conc_values + assert 64 in conc_values + + def test_exp_name_format(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """exp-name should have correct format.""" + full_sweep_args_single_node.seq_lens = ["1k1k"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert all(entry["exp-name"] == "dsr1_1k1k" for entry in result) + + def test_max_model_len_calculation(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """max-model-len should be isl + osl + 200.""" + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + for entry in result: + expected_max_model_len = entry["isl"] + entry["osl"] + 200 + assert entry["max-model-len"] == expected_max_model_len + + +# ============================================================================= +# Test generate_full_sweep for multi-node +# ============================================================================= + +class TestGenerateFullSweepMultiNode: + """Tests for generate_full_sweep with multi-node configs.""" + + def test_multinode_sweep_generation(self, sample_multinode_config, sample_runner_config, full_sweep_args_multi_node): + """Multinode sweep should generate entries with prefill/decode.""" + result = generate_full_sweep( + full_sweep_args_multi_node, + sample_multinode_config, + sample_runner_config + ) + assert len(result) == 1 # One entry with conc-list + + def test_multinode_entry_structure(self, sample_multinode_config, sample_runner_config, full_sweep_args_multi_node): + """Multinode entries should have prefill and decode configs.""" + result = generate_full_sweep( + full_sweep_args_multi_node, + sample_multinode_config, + sample_runner_config + ) + entry = result[0] + assert "prefill" in entry + assert "decode" in entry + assert entry["prefill"]["num-worker"] == 5 + assert entry["decode"]["num-worker"] == 1 + assert entry["disagg"] is True + + def test_multinode_conc_as_list(self, sample_multinode_config, sample_runner_config, full_sweep_args_multi_node): + """Multinode conc should be passed as list.""" + result = generate_full_sweep( + full_sweep_args_multi_node, + sample_multinode_config, + sample_runner_config + ) + entry = result[0] + assert isinstance(entry["conc"], list) + assert entry["conc"] == [2150] + + def test_single_node_flag_skips_multinode(self, sample_multinode_config, sample_runner_config, full_sweep_args_single_node): + """Single-node flag should skip multinode configs.""" + result = generate_full_sweep( + full_sweep_args_single_node, + sample_multinode_config, + sample_runner_config + ) + assert len(result) == 0 + + +# ============================================================================= +# Test generate_runner_model_sweep_config +# ============================================================================= + +class TestGenerateRunnerModelSweepConfig: + """Tests for generate_runner_model_sweep_config function.""" + + @pytest.fixture + def runner_sweep_args(self): + """Args for runner-model-sweep command (single-node).""" + args = argparse.Namespace() + args.runner_type = "mi300x" + args.runner_config = "runners.yaml" + args.runner_node_filter 
= None + args.single_node = True + args.multi_node = False + return args + + def test_basic_runner_sweep(self, sample_single_node_config, sample_runner_config, runner_sweep_args): + """Basic runner sweep should generate entries for each node.""" + result = generate_runner_model_sweep_config( + runner_sweep_args, + sample_single_node_config, + sample_runner_config + ) + # 3 mi300x nodes + assert len(result) == 3 + + def test_runner_sweep_entry_structure(self, sample_single_node_config, sample_runner_config, runner_sweep_args): + """Runner sweep entries should use 1k1k config.""" + result = generate_runner_model_sweep_config( + runner_sweep_args, + sample_single_node_config, + sample_runner_config + ) + for entry in result: + assert entry["isl"] == 1024 + assert entry["osl"] == 1024 + assert entry["max-model-len"] == 2048 + assert "_test" in entry["exp-name"] + + def test_each_node_gets_entry(self, sample_single_node_config, sample_runner_config, runner_sweep_args): + """Each runner node should get its own entry.""" + result = generate_runner_model_sweep_config( + runner_sweep_args, + sample_single_node_config, + sample_runner_config + ) + runners = [entry["runner"] for entry in result] + assert "mi300x-amd_0" in runners + assert "mi300x-amd_1" in runners + assert "mi300x-cr_0" in runners + + def test_invalid_runner_type(self, sample_single_node_config, sample_runner_config, runner_sweep_args): + """Invalid runner type should raise error.""" + runner_sweep_args.runner_type = "nonexistent" + with pytest.raises(ValueError) as exc_info: + generate_runner_model_sweep_config( + runner_sweep_args, + sample_single_node_config, + sample_runner_config + ) + assert "does not exist" in str(exc_info.value) + + def test_runner_node_filter(self, sample_single_node_config, sample_runner_config, runner_sweep_args): + """Runner node filter should limit nodes.""" + runner_sweep_args.runner_node_filter = "amd" + result = generate_runner_model_sweep_config( + runner_sweep_args, + sample_single_node_config, + sample_runner_config + ) + # Only mi300x-amd_0 and mi300x-amd_1 match + assert len(result) == 2 + assert all("amd" in entry["runner"] for entry in result) + + def test_runner_node_filter_no_match(self, sample_single_node_config, sample_runner_config, runner_sweep_args): + """Runner node filter with no matches should raise error.""" + runner_sweep_args.runner_node_filter = "nonexistent" + with pytest.raises(ValueError) as exc_info: + generate_runner_model_sweep_config( + runner_sweep_args, + sample_single_node_config, + sample_runner_config + ) + assert "No runner nodes found" in str(exc_info.value) + + def test_uses_highest_tp(self, sample_single_node_config, sample_runner_config, runner_sweep_args): + """Should use highest TP from search space.""" + result = generate_runner_model_sweep_config( + runner_sweep_args, + sample_single_node_config, + sample_runner_config + ) + # Config has tp=8 + assert all(entry["tp"] == 8 for entry in result) + + def test_uses_lowest_conc(self, sample_single_node_config, sample_runner_config, runner_sweep_args): + """Should use lowest concurrency from search space.""" + result = generate_runner_model_sweep_config( + runner_sweep_args, + sample_single_node_config, + sample_runner_config + ) + # Config has conc-start=4 + assert all(entry["conc"] == 4 for entry in result) + + +# ============================================================================= +# Test edge cases and special configurations +# ============================================================================= 
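The edge cases below mostly exercise the clamped geometric concurrency expansion that generate_full_sweep applies to conc-start/conc-end ranges. A standalone sketch of that expansion, where expand_conc is a hypothetical helper and not part of the module:

def expand_conc(start: int, end: int, step: int = 2) -> list[int]:
    """Multiply start by step until end, clamping the final value to end."""
    values, conc = [], start
    while conc <= end:
        values.append(conc)
        if conc == end:
            break
        conc *= step
        if conc > end:
            conc = end  # clamp the overshoot so end is always included
    return values

assert expand_conc(4, 64) == [4, 8, 16, 32, 64]
assert expand_conc(1, 8) == [1, 2, 4, 8]
assert expand_conc(1, 6, step=4) == [1, 4, 6]  # 16 would overshoot, so clamp to 6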
+ +class TestEdgeCases: + """Tests for edge cases and special configurations.""" + + def test_config_with_ep_and_dp_attn(self, sample_runner_config, full_sweep_args_single_node): + """Config with ep and dp-attn should be handled correctly.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "model-prefix": "test", + "precision": "fp4", + "framework": "sglang", + "runner": "b200", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 4, "ep": 4, "dp-attn": True, "conc-start": 4, "conc-end": 4} + ] + } + ] + } + } + result = generate_full_sweep( + full_sweep_args_single_node, + config, + sample_runner_config + ) + assert len(result) == 1 + assert result[0]["ep"] == 4 + assert result[0]["dp-attn"] is True + + def test_config_with_spec_decoding(self, sample_runner_config, full_sweep_args_single_node): + """Config with spec-decoding should be handled correctly.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "model-prefix": "test", + "precision": "fp4", + "framework": "trt", + "runner": "b200", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 8, "spec-decoding": "mtp", "conc-start": 4, "conc-end": 4} + ] + } + ] + } + } + result = generate_full_sweep( + full_sweep_args_single_node, + config, + sample_runner_config + ) + assert len(result) == 1 + assert result[0]["spec-decoding"] == "mtp" + + def test_conc_list_in_single_node(self, sample_runner_config, full_sweep_args_single_node): + """Single node config with conc-list should work.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "model-prefix": "test", + "precision": "fp8", + "framework": "sglang", + "runner": "mi300x", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 16} + ] + } + ] + } + } + result = generate_full_sweep( + full_sweep_args_single_node, + config, + sample_runner_config + ) + conc_values = [entry["conc"] for entry in result] + assert 4 in conc_values + assert 8 in conc_values + assert 16 in conc_values + + def test_disagg_defaults_to_false(self, sample_runner_config, full_sweep_args_single_node): + """disagg should default to False when not specified.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "model-prefix": "test", + "precision": "fp8", + "framework": "sglang", + "runner": "mi300x", + "multinode": False, + # No disagg field + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 4} + ] + } + ] + } + } + result = generate_full_sweep( + full_sweep_args_single_node, + config, + sample_runner_config + ) + assert result[0]["disagg"] is False + + def test_multinode_conc_range_expansion(self, sample_runner_config, full_sweep_args_multi_node): + """Multinode with conc range should expand to list.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "model-prefix": "test", + "precision": "fp4", + "framework": "dynamo-trt", + "runner": "gb200", + "multinode": True, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + { + "conc-start": 1, + "conc-end": 8, + "prefill": { + "num-worker": 1, + "tp": 4, + "ep": 4, + "dp-attn": False, + }, + "decode": { + "num-worker": 1, + "tp": 8, + "ep": 8, + "dp-attn": False, + }, + } + ] + } + ] + } + } + result = 
generate_full_sweep( + full_sweep_args_multi_node, + config, + sample_runner_config + ) + assert len(result) == 1 + # step_size=2: 1, 2, 4, 8 + assert result[0]["conc"] == [1, 2, 4, 8] + + def test_max_ep_creates_config_when_below_min(self, sample_runner_config, full_sweep_args_single_node): + """max_ep below config's ep should create config with max_ep value.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "model-prefix": "test", + "precision": "fp4", + "framework": "sglang", + "runner": "b200", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 4} + ] + } + ] + } + } + full_sweep_args_single_node.max_ep = 2 + result = generate_full_sweep( + full_sweep_args_single_node, + config, + sample_runner_config + ) + # ep=8 in config, but max_ep=2, so should use ep=2 + assert len(result) == 1 + assert result[0]["ep"] == 2 + + def test_max_ep_zero_or_negative_skips(self, sample_runner_config, full_sweep_args_single_node): + """max_ep of 0 or negative should skip configs.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "model-prefix": "test", + "precision": "fp4", + "framework": "sglang", + "runner": "b200", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 4} + ] + } + ] + } + } + for invalid_value in [0, -1, -100]: + full_sweep_args_single_node.max_ep = invalid_value + result = generate_full_sweep( + full_sweep_args_single_node, + config, + sample_runner_config + ) + assert len(result) == 0, f"Expected 0 results for max_ep={invalid_value}" + + def test_multinode_max_conc_zero_or_negative_skips(self, sample_runner_config, full_sweep_args_multi_node): + """Multinode max_conc of 0 or negative should skip configs.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "model-prefix": "test", + "precision": "fp4", + "framework": "dynamo-trt", + "runner": "gb200", + "multinode": True, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + { + "conc-list": [100, 200, 400], + "prefill": { + "num-worker": 1, + "tp": 4, + "ep": 4, + "dp-attn": False, + }, + "decode": { + "num-worker": 1, + "tp": 8, + "ep": 8, + "dp-attn": False, + }, + } + ] + } + ] + } + } + for invalid_value in [0, -1, -100]: + full_sweep_args_multi_node.max_conc = invalid_value + result = generate_full_sweep( + full_sweep_args_multi_node, + config, + sample_runner_config + ) + assert len(result) == 0, f"Expected 0 results for max_conc={invalid_value}" + + def test_multinode_max_conc_creates_config_when_below_min(self, sample_runner_config, full_sweep_args_multi_node): + """Multinode max_conc below all values should create config with max_conc.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "model-prefix": "test", + "precision": "fp4", + "framework": "dynamo-trt", + "runner": "gb200", + "multinode": True, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + { + "conc-list": [100, 200, 400], + "prefill": { + "num-worker": 1, + "tp": 4, + "ep": 4, + "dp-attn": False, + }, + "decode": { + "num-worker": 1, + "tp": 8, + "ep": 8, + "dp-attn": False, + }, + } + ] + } + ] + } + } + full_sweep_args_multi_node.max_conc = 1 + result = generate_full_sweep( + full_sweep_args_multi_node, + config, + sample_runner_config + ) + # All conc values (100, 200, 400) > 
max_conc (1), so should use [1] + assert len(result) == 1 + assert result[0]["conc"] == [1] + + def test_combined_max_filters(self, sample_runner_config, full_sweep_args_single_node): + """Multiple max filters should all apply and create configs with max values.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "model-prefix": "test", + "precision": "fp4", + "framework": "sglang", + "runner": "b200", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 8, "ep": 8, "conc-start": 100, "conc-end": 200} + ] + } + ] + } + } + full_sweep_args_single_node.max_tp = 2 + full_sweep_args_single_node.max_ep = 1 + full_sweep_args_single_node.max_conc = 1 + result = generate_full_sweep( + full_sweep_args_single_node, + config, + sample_runner_config + ) + # All values exceed max, so should use max values + assert len(result) == 1 + assert result[0]["tp"] == 2 + assert result[0]["ep"] == 1 + assert result[0]["conc"] == 1 diff --git a/utils/matrix_logic/test_validation.py b/utils/matrix_logic/test_validation.py new file mode 100644 index 000000000..d9cc7f0d9 --- /dev/null +++ b/utils/matrix_logic/test_validation.py @@ -0,0 +1,869 @@ +"""Comprehensive tests for validation.py""" +import pytest +from validation import ( + Fields, + SingleNodeMatrixEntry, + MultiNodeMatrixEntry, + WorkerConfig, + SingleNodeSearchSpaceEntry, + MultiNodeSearchSpaceEntry, + SingleNodeSeqLenConfig, + MultiNodeSeqLenConfig, + SingleNodeMasterConfigEntry, + MultiNodeMasterConfigEntry, + validate_matrix_entry, + validate_master_config, + validate_runner_config, + load_config_files, + load_runner_file, +) + + +# ============================================================================= +# Test Fixtures +# ============================================================================= + +@pytest.fixture +def valid_single_node_matrix_entry(): + """Valid single node matrix entry based on dsr1-fp4-mi355x-sglang config.""" + return { + "image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915", + "model": "amd/DeepSeek-R1-0528-MXFP4-Preview", + "model-prefix": "dsr1", + "precision": "fp4", + "framework": "sglang", + "spec-decoding": "none", + "runner": "mi355x", + "isl": 1024, + "osl": 1024, + "tp": 8, + "ep": 1, + "dp-attn": False, + "conc": 4, + "max-model-len": 2248, + "exp-name": "dsr1_1k1k", + "disagg": False, + } + + +@pytest.fixture +def valid_multinode_matrix_entry(): + """Valid multinode matrix entry based on dsr1-fp4-gb200-dynamo-trt config.""" + return { + "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", + "model": "deepseek-r1-fp4", + "model-prefix": "dsr1", + "precision": "fp4", + "framework": "dynamo-trt", + "spec-decoding": "none", + "runner": "gb200", + "isl": 1024, + "osl": 1024, + "prefill": { + "num-worker": 5, + "tp": 4, + "ep": 4, + "dp-attn": True, + "additional-settings": [ + "PREFILL_MAX_NUM_TOKENS=8448", + "PREFILL_MAX_BATCH_SIZE=1", + ], + }, + "decode": { + "num-worker": 1, + "tp": 8, + "ep": 8, + "dp-attn": True, + "additional-settings": [ + "DECODE_MAX_NUM_TOKENS=256", + "DECODE_MAX_BATCH_SIZE=256", + "DECODE_GPU_MEM_FRACTION=0.8", + "DECODE_MTP_SIZE=0", + ], + }, + "conc": [2150], + "max-model-len": 2248, + "exp-name": "dsr1_1k1k", + "disagg": True, + } + + +@pytest.fixture +def valid_single_node_master_config(): + """Valid single node master config based on dsr1-fp8-mi300x-sglang.""" + return { + "image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915", 
+ "model": "deepseek-ai/DeepSeek-R1-0528", + "model-prefix": "dsr1", + "precision": "fp8", + "framework": "sglang", + "runner": "mi300x", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + } + ] + } + + +@pytest.fixture +def valid_multinode_master_config(): + """Valid multinode master config based on dsr1-fp4-gb200-dynamo-trt.""" + return { + "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", + "model": "deepseek-r1-fp4", + "model-prefix": "dsr1", + "precision": "fp4", + "framework": "dynamo-trt", + "runner": "gb200", + "multinode": True, + "disagg": True, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + { + "prefill": { + "num-worker": 5, + "tp": 4, + "ep": 4, + "dp-attn": True, + "additional-settings": [ + "PREFILL_MAX_NUM_TOKENS=8448", + "PREFILL_MAX_BATCH_SIZE=1", + ], + }, + "decode": { + "num-worker": 1, + "tp": 8, + "ep": 8, + "dp-attn": True, + "additional-settings": [ + "DECODE_MAX_NUM_TOKENS=256", + "DECODE_MAX_BATCH_SIZE=256", + ], + }, + "conc-list": [2150], + } + ] + } + ] + } + + +@pytest.fixture +def valid_runner_config(): + """Valid runner config based on .github/configs/runners.yaml.""" + return { + "h100": ["h100-cr_0", "h100-cr_1", "h100-cw_0", "h100-cw_1"], + "h200": ["h200-cw_0", "h200-cw_1", "h200-nb_0", "h200-nb_1"], + "b200": ["b200-nvd_0", "b200-nvd_1", "b200-dgxc_1"], + "mi300x": ["mi300x-amd_0", "mi300x-amd_1", "mi300x-cr_0"], + "gb200": ["gb200-nv_0"], + } + + +# ============================================================================= +# Test Fields Enum +# ============================================================================= + +class TestFieldsEnum: + """Tests for Fields enum.""" + + def test_field_values_are_strings(self): + """All field values should be strings.""" + for field in Fields: + assert isinstance(field.value, str) + + def test_key_fields_exist(self): + """Key fields should be defined.""" + assert Fields.IMAGE.value == "image" + assert Fields.MODEL.value == "model" + assert Fields.TP.value == "tp" + assert Fields.MULTINODE.value == "multinode" + assert Fields.CONC.value == "conc" + assert Fields.SPEC_DECODING.value == "spec-decoding" + assert Fields.PREFILL.value == "prefill" + assert Fields.DECODE.value == "decode" + + +# ============================================================================= +# Test WorkerConfig +# ============================================================================= + +class TestWorkerConfig: + """Tests for WorkerConfig model.""" + + def test_valid_worker_config(self): + """Valid worker config should pass.""" + config = WorkerConfig(**{ + "num-worker": 5, + "tp": 4, + "ep": 4, + "dp-attn": True, + }) + assert config.num_worker == 5 + assert config.tp == 4 + assert config.ep == 4 + assert config.dp_attn is True + + def test_worker_config_with_additional_settings(self): + """Worker config with additional settings should pass.""" + config = WorkerConfig(**{ + "num-worker": 1, + "tp": 8, + "ep": 8, + "dp-attn": True, + "additional-settings": [ + "DECODE_MAX_NUM_TOKENS=256", + "DECODE_MAX_BATCH_SIZE=256", + "DECODE_GPU_MEM_FRACTION=0.8", + ], + }) + assert len(config.additional_settings) == 3 + assert "DECODE_MAX_NUM_TOKENS=256" in config.additional_settings + + def test_worker_config_missing_required_field(self): + """Missing required field should fail.""" + with pytest.raises(Exception): + WorkerConfig(**{ + "num-worker": 2, + "tp": 4, + # Missing ep and 
dp-attn + }) + + def test_worker_config_extra_field_forbidden(self): + """Extra fields should be forbidden.""" + with pytest.raises(Exception): + WorkerConfig(**{ + "num-worker": 2, + "tp": 4, + "ep": 1, + "dp-attn": False, + "unknown-field": "value", + }) + + +# ============================================================================= +# Test SingleNodeMatrixEntry +# ============================================================================= + +class TestSingleNodeMatrixEntry: + """Tests for SingleNodeMatrixEntry model.""" + + def test_valid_entry(self, valid_single_node_matrix_entry): + """Valid entry should pass validation.""" + entry = SingleNodeMatrixEntry(**valid_single_node_matrix_entry) + assert entry.image == "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915" + assert entry.tp == 8 + assert entry.conc == 4 + assert entry.framework == "sglang" + + def test_conc_as_list(self, valid_single_node_matrix_entry): + """Conc can be a list of integers.""" + valid_single_node_matrix_entry["conc"] = [4, 8, 16, 32, 64] + entry = SingleNodeMatrixEntry(**valid_single_node_matrix_entry) + assert entry.conc == [4, 8, 16, 32, 64] + + def test_spec_decoding_values(self, valid_single_node_matrix_entry): + """Spec decoding should accept valid literal values.""" + for value in ["mtp", "draft_model", "none"]: + valid_single_node_matrix_entry["spec-decoding"] = value + entry = SingleNodeMatrixEntry(**valid_single_node_matrix_entry) + assert entry.spec_decoding == value + + def test_invalid_spec_decoding(self, valid_single_node_matrix_entry): + """Invalid spec decoding value should fail.""" + valid_single_node_matrix_entry["spec-decoding"] = "invalid" + with pytest.raises(Exception): + SingleNodeMatrixEntry(**valid_single_node_matrix_entry) + + def test_missing_required_field(self, valid_single_node_matrix_entry): + """Missing required field should fail validation.""" + del valid_single_node_matrix_entry["model"] + with pytest.raises(Exception): + SingleNodeMatrixEntry(**valid_single_node_matrix_entry) + + def test_extra_field_forbidden(self, valid_single_node_matrix_entry): + """Extra fields should be forbidden.""" + valid_single_node_matrix_entry["extra-field"] = "value" + with pytest.raises(Exception): + SingleNodeMatrixEntry(**valid_single_node_matrix_entry) + + +# ============================================================================= +# Test MultiNodeMatrixEntry +# ============================================================================= + +class TestMultiNodeMatrixEntry: + """Tests for MultiNodeMatrixEntry model.""" + + def test_valid_entry(self, valid_multinode_matrix_entry): + """Valid entry should pass validation.""" + entry = MultiNodeMatrixEntry(**valid_multinode_matrix_entry) + assert entry.model == "deepseek-r1-fp4" + assert entry.conc == [2150] + assert entry.disagg is True + + def test_prefill_decode_worker_configs(self, valid_multinode_matrix_entry): + """Prefill and decode should be WorkerConfig objects.""" + entry = MultiNodeMatrixEntry(**valid_multinode_matrix_entry) + assert entry.prefill.num_worker == 5 + assert entry.prefill.tp == 4 + assert entry.decode.tp == 8 + assert entry.decode.dp_attn is True + + def test_conc_must_be_list(self, valid_multinode_matrix_entry): + """Conc must be a list for multinode.""" + valid_multinode_matrix_entry["conc"] = 2150 # Single int, not list + with pytest.raises(Exception): + MultiNodeMatrixEntry(**valid_multinode_matrix_entry) + + def test_missing_prefill(self, valid_multinode_matrix_entry): + """Missing prefill 
should fail.""" + del valid_multinode_matrix_entry["prefill"] + with pytest.raises(Exception): + MultiNodeMatrixEntry(**valid_multinode_matrix_entry) + + def test_missing_decode(self, valid_multinode_matrix_entry): + """Missing decode should fail.""" + del valid_multinode_matrix_entry["decode"] + with pytest.raises(Exception): + MultiNodeMatrixEntry(**valid_multinode_matrix_entry) + + +# ============================================================================= +# Test validate_matrix_entry function +# ============================================================================= + +class TestValidateMatrixEntry: + """Tests for validate_matrix_entry function.""" + + def test_valid_single_node(self, valid_single_node_matrix_entry): + """Valid single node entry should return the entry.""" + result = validate_matrix_entry(valid_single_node_matrix_entry, is_multinode=False) + assert result == valid_single_node_matrix_entry + + def test_valid_multinode(self, valid_multinode_matrix_entry): + """Valid multinode entry should return the entry.""" + result = validate_matrix_entry(valid_multinode_matrix_entry, is_multinode=True) + assert result == valid_multinode_matrix_entry + + def test_invalid_single_node_raises_valueerror(self, valid_single_node_matrix_entry): + """Invalid single node entry should raise ValueError.""" + del valid_single_node_matrix_entry["tp"] + with pytest.raises(ValueError) as exc_info: + validate_matrix_entry(valid_single_node_matrix_entry, is_multinode=False) + assert "failed validation" in str(exc_info.value) + + def test_invalid_multinode_raises_valueerror(self, valid_multinode_matrix_entry): + """Invalid multinode entry should raise ValueError.""" + del valid_multinode_matrix_entry["prefill"] + with pytest.raises(ValueError) as exc_info: + validate_matrix_entry(valid_multinode_matrix_entry, is_multinode=True) + assert "failed validation" in str(exc_info.value) + + +# ============================================================================= +# Test SingleNodeSearchSpaceEntry +# ============================================================================= + +class TestSingleNodeSearchSpaceEntry: + """Tests for SingleNodeSearchSpaceEntry model.""" + + def test_valid_with_conc_range(self): + """Valid entry with conc range should pass (like mi300x config).""" + entry = SingleNodeSearchSpaceEntry(**{ + "tp": 8, + "conc-start": 4, + "conc-end": 64, + }) + assert entry.tp == 8 + assert entry.conc_start == 4 + assert entry.conc_end == 64 + + def test_valid_with_conc_list(self): + """Valid entry with conc list should pass.""" + entry = SingleNodeSearchSpaceEntry(**{ + "tp": 4, + "conc-list": [4, 8, 16, 32, 64, 128], + }) + assert entry.conc_list == [4, 8, 16, 32, 64, 128] + + def test_cannot_have_both_range_and_list(self): + """Cannot specify both conc range and list.""" + with pytest.raises(Exception) as exc_info: + SingleNodeSearchSpaceEntry(**{ + "tp": 4, + "conc-start": 4, + "conc-end": 64, + "conc-list": [4, 8, 16], + }) + assert "Cannot specify both" in str(exc_info.value) + + def test_must_have_range_or_list(self): + """Must specify either conc range or list.""" + with pytest.raises(Exception) as exc_info: + SingleNodeSearchSpaceEntry(**{ + "tp": 8, + }) + assert "Must specify either" in str(exc_info.value) + + def test_conc_start_must_be_lte_conc_end(self): + """conc-start must be <= conc-end.""" + with pytest.raises(Exception) as exc_info: + SingleNodeSearchSpaceEntry(**{ + "tp": 8, + "conc-start": 64, + "conc-end": 4, + }) + assert "must be <=" in str(exc_info.value) 
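+
+    # A hedged sketch of what a validated conc range is expected to expand to
+    # downstream, mirroring the step_size=2 doubling asserted in
+    # test_multinode_conc_range_expansion (illustrative helper only):
+    @staticmethod
+    def _expand_conc_range_sketch(start, end):
+        vals, c = [], start
+        while c < end:
+            vals.append(c)
+            c *= 2
+        vals.append(end)  # e.g. (1, 8) -> [1, 2, 4, 8]; (4, 4) -> [4]
+        return vals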
+ + def test_conc_list_values_must_be_positive(self): + """conc-list values must be > 0.""" + with pytest.raises(Exception) as exc_info: + SingleNodeSearchSpaceEntry(**{ + "tp": 4, + "conc-list": [4, 0, 16], + }) + assert "must be greater than 0" in str(exc_info.value) + + def test_optional_fields_defaults(self): + """Optional fields should have correct defaults.""" + entry = SingleNodeSearchSpaceEntry(**{ + "tp": 8, + "conc-list": [4, 8], + }) + assert entry.ep is None + assert entry.dp_attn is None + assert entry.spec_decoding == "none" + + def test_with_ep_and_dp_attn(self): + """Entry with ep and dp-attn like b200-sglang config.""" + entry = SingleNodeSearchSpaceEntry(**{ + "tp": 4, + "ep": 4, + "dp-attn": True, + "conc-start": 4, + "conc-end": 128, + }) + assert entry.ep == 4 + assert entry.dp_attn is True + + def test_with_spec_decoding_mtp(self): + """Entry with mtp spec decoding.""" + entry = SingleNodeSearchSpaceEntry(**{ + "tp": 8, + "spec-decoding": "mtp", + "conc-list": [1, 2, 4], + }) + assert entry.spec_decoding == "mtp" + + +# ============================================================================= +# Test MultiNodeSearchSpaceEntry +# ============================================================================= + +class TestMultiNodeSearchSpaceEntry: + """Tests for MultiNodeSearchSpaceEntry model.""" + + def test_valid_with_conc_list(self): + """Valid multinode search space with list (like gb200 config).""" + entry = MultiNodeSearchSpaceEntry(**{ + "prefill": { + "num-worker": 5, + "tp": 4, + "ep": 4, + "dp-attn": True, + "additional-settings": ["PREFILL_MAX_NUM_TOKENS=8448"], + }, + "decode": { + "num-worker": 1, + "tp": 8, + "ep": 8, + "dp-attn": True, + "additional-settings": ["DECODE_MAX_NUM_TOKENS=256"], + }, + "conc-list": [2150], + }) + assert entry.prefill.num_worker == 5 + assert entry.decode.tp == 8 + + def test_valid_with_conc_range(self): + """Valid multinode search space with range.""" + entry = MultiNodeSearchSpaceEntry(**{ + "prefill": { + "num-worker": 1, + "tp": 4, + "ep": 4, + "dp-attn": False, + }, + "decode": { + "num-worker": 4, + "tp": 8, + "ep": 8, + "dp-attn": False, + }, + "conc-start": 1, + "conc-end": 64, + }) + assert entry.conc_start == 1 + assert entry.conc_end == 64 + + def test_with_spec_decoding_mtp(self): + """Multinode entry with mtp spec decoding.""" + entry = MultiNodeSearchSpaceEntry(**{ + "spec-decoding": "mtp", + "prefill": { + "num-worker": 1, + "tp": 4, + "ep": 4, + "dp-attn": False, + }, + "decode": { + "num-worker": 4, + "tp": 8, + "ep": 8, + "dp-attn": False, + }, + "conc-list": [1, 2, 4, 8, 16, 36], + }) + assert entry.spec_decoding == "mtp" + + def test_missing_conc_specification(self): + """Missing conc specification should fail.""" + with pytest.raises(Exception): + MultiNodeSearchSpaceEntry(**{ + "prefill": { + "num-worker": 2, + "tp": 4, + "ep": 4, + "dp-attn": False, + }, + "decode": { + "num-worker": 2, + "tp": 4, + "ep": 4, + "dp-attn": False, + }, + # Missing conc specification + }) + + +# ============================================================================= +# Test SeqLenConfig models +# ============================================================================= + +class TestSeqLenConfigs: + """Tests for sequence length config models.""" + + def test_single_node_seq_len_config_1k1k(self): + """Valid single node seq len config for 1k/1k.""" + config = SingleNodeSeqLenConfig(**{ + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }) + assert config.isl == 
1024 + assert config.osl == 1024 + assert len(config.search_space) == 1 + + def test_single_node_seq_len_config_8k1k(self): + """Valid single node seq len config for 8k/1k.""" + config = SingleNodeSeqLenConfig(**{ + "isl": 8192, + "osl": 1024, + "search-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }) + assert config.isl == 8192 + assert config.osl == 1024 + + def test_multinode_seq_len_config(self): + """Valid multinode seq len config.""" + config = MultiNodeSeqLenConfig(**{ + "isl": 1024, + "osl": 1024, + "search-space": [ + { + "prefill": { + "num-worker": 5, + "tp": 4, + "ep": 4, + "dp-attn": True, + }, + "decode": { + "num-worker": 1, + "tp": 8, + "ep": 8, + "dp-attn": True, + }, + "conc-list": [2150], + } + ] + }) + assert config.isl == 1024 + assert config.osl == 1024 + + +# ============================================================================= +# Test MasterConfigEntry models +# ============================================================================= + +class TestMasterConfigEntries: + """Tests for master config entry models.""" + + def test_single_node_master_config(self, valid_single_node_master_config): + """Valid single node master config.""" + config = SingleNodeMasterConfigEntry(**valid_single_node_master_config) + assert config.multinode is False + assert config.model_prefix == "dsr1" + assert config.runner == "mi300x" + assert config.framework == "sglang" + + def test_multinode_master_config(self, valid_multinode_master_config): + """Valid multinode master config.""" + config = MultiNodeMasterConfigEntry(**valid_multinode_master_config) + assert config.multinode is True + assert config.model_prefix == "dsr1" + assert config.runner == "gb200" + assert config.disagg is True + + def test_single_node_cannot_have_multinode_true(self, valid_single_node_master_config): + """Single node config must have multinode=False.""" + valid_single_node_master_config["multinode"] = True + with pytest.raises(Exception): + SingleNodeMasterConfigEntry(**valid_single_node_master_config) + + def test_multinode_cannot_have_multinode_false(self, valid_multinode_master_config): + """Multinode config must have multinode=True.""" + valid_multinode_master_config["multinode"] = False + with pytest.raises(Exception): + MultiNodeMasterConfigEntry(**valid_multinode_master_config) + + def test_disagg_default_false(self, valid_single_node_master_config): + """Disagg should default to False.""" + config = SingleNodeMasterConfigEntry(**valid_single_node_master_config) + assert config.disagg is False + + +# ============================================================================= +# Test validate_master_config function +# ============================================================================= + +class TestValidateMasterConfig: + """Tests for validate_master_config function.""" + + def test_valid_single_node_config(self, valid_single_node_master_config): + """Valid single node config should pass.""" + configs = {"dsr1-fp8-mi300x-sglang": valid_single_node_master_config} + result = validate_master_config(configs) + assert result == configs + + def test_valid_multinode_config(self, valid_multinode_master_config): + """Valid multinode config should pass.""" + configs = {"dsr1-fp4-gb200-dynamo-trt": valid_multinode_master_config} + result = validate_master_config(configs) + assert result == configs + + def test_mixed_configs(self, valid_single_node_master_config, valid_multinode_master_config): + """Mixed single and multinode configs should pass.""" + configs = { + 
"dsr1-fp8-mi300x-sglang": valid_single_node_master_config, + "dsr1-fp4-gb200-dynamo-trt": valid_multinode_master_config, + } + result = validate_master_config(configs) + assert len(result) == 2 + + def test_invalid_config_raises_valueerror(self, valid_single_node_master_config): + """Invalid config should raise ValueError with key name.""" + del valid_single_node_master_config["model"] + configs = {"broken-config": valid_single_node_master_config} + with pytest.raises(ValueError) as exc_info: + validate_master_config(configs) + assert "broken-config" in str(exc_info.value) + assert "failed validation" in str(exc_info.value) + + +# ============================================================================= +# Test validate_runner_config function +# ============================================================================= + +class TestValidateRunnerConfig: + """Tests for validate_runner_config function.""" + + def test_valid_runner_config(self, valid_runner_config): + """Valid runner config should pass.""" + result = validate_runner_config(valid_runner_config) + assert result == valid_runner_config + + def test_value_must_be_list(self): + """Runner config values must be lists.""" + config = { + "h100": "h100-cr_0", # Not a list + } + with pytest.raises(ValueError) as exc_info: + validate_runner_config(config) + assert "must be a list" in str(exc_info.value) + + def test_list_must_contain_strings(self): + """Runner config lists must contain only strings.""" + config = { + "h100": ["h100-cr_0", 123], # Contains non-string + } + with pytest.raises(ValueError) as exc_info: + validate_runner_config(config) + assert "must contain only strings" in str(exc_info.value) + + def test_list_cannot_be_empty(self): + """Runner config lists cannot be empty.""" + config = { + "mi355x": [], + } + with pytest.raises(ValueError) as exc_info: + validate_runner_config(config) + assert "cannot be an empty list" in str(exc_info.value) + + def test_multiple_runner_types(self, valid_runner_config): + """Multiple runner types should work.""" + result = validate_runner_config(valid_runner_config) + assert "h100" in result + assert "h200" in result + assert "mi300x" in result + assert "gb200" in result + + +# ============================================================================= +# Test load_config_files +# ============================================================================= + +class TestLoadConfigFiles: + """Tests for load_config_files function.""" + + def test_load_single_file_with_validation(self, tmp_path, valid_single_node_master_config): + """Should load and validate a single config file.""" + config_file = tmp_path / "config.yaml" + import yaml + config_file.write_text(yaml.dump({"test-config": valid_single_node_master_config})) + result = load_config_files([str(config_file)]) + assert "test-config" in result + assert result["test-config"]["image"] == valid_single_node_master_config["image"] + + def test_load_single_file_without_validation(self, tmp_path): + """Should load a single config file without validation when validate=False.""" + config_file = tmp_path / "config.yaml" + config_file.write_text(""" +test-config: + image: test-image + model: test-model +""") + result = load_config_files([str(config_file)], validate=False) + assert "test-config" in result + assert result["test-config"]["image"] == "test-image" + + def test_load_multiple_files(self, tmp_path): + """Should merge multiple config files.""" + config1 = tmp_path / "config1.yaml" + config1.write_text(""" +config-one: + value: 1 
+""") + config2 = tmp_path / "config2.yaml" + config2.write_text(""" +config-two: + value: 2 +""") + result = load_config_files([str(config1), str(config2)], validate=False) + assert "config-one" in result + assert "config-two" in result + + def test_duplicate_keys_raise_error(self, tmp_path): + """Duplicate keys across files should raise error.""" + config1 = tmp_path / "config1.yaml" + config1.write_text(""" +duplicate-key: + value: 1 +""") + config2 = tmp_path / "config2.yaml" + config2.write_text(""" +duplicate-key: + value: 2 +""") + with pytest.raises(ValueError) as exc_info: + load_config_files([str(config1), str(config2)], validate=False) + assert "Duplicate configuration keys" in str(exc_info.value) + + def test_nonexistent_file_raises_error(self): + """Nonexistent file should raise error.""" + with pytest.raises(ValueError) as exc_info: + load_config_files(["nonexistent.yaml"]) + assert "does not exist" in str(exc_info.value) + + def test_validation_runs_by_default(self, tmp_path): + """Validation should run by default and catch invalid configs.""" + config_file = tmp_path / "config.yaml" + config_file.write_text(""" +invalid-config: + image: test-image + # Missing required fields like model, model-prefix, precision, etc. +""") + with pytest.raises(ValueError) as exc_info: + load_config_files([str(config_file)]) + assert "failed validation" in str(exc_info.value) + + +# ============================================================================= +# Test load_runner_file +# ============================================================================= + +class TestLoadRunnerFile: + """Tests for load_runner_file function.""" + + def test_load_runner_file_with_validation(self, tmp_path): + """Should load and validate runner config file.""" + runner_file = tmp_path / "runners.yaml" + runner_file.write_text(""" +h100: +- h100-node-0 +- h100-node-1 +""") + result = load_runner_file(str(runner_file)) + assert "h100" in result + assert len(result["h100"]) == 2 + + def test_load_runner_file_without_validation(self, tmp_path): + """Should load runner config file without validation when validate=False.""" + runner_file = tmp_path / "runners.yaml" + runner_file.write_text(""" +h100: +- h100-node-0 +- h100-node-1 +""") + result = load_runner_file(str(runner_file), validate=False) + assert "h100" in result + assert len(result["h100"]) == 2 + + def test_nonexistent_runner_file(self): + """Nonexistent runner file should raise error.""" + with pytest.raises(ValueError) as exc_info: + load_runner_file("nonexistent.yaml") + assert "does not exist" in str(exc_info.value) + + def test_validation_runs_by_default(self, tmp_path): + """Validation should run by default and catch invalid configs.""" + runner_file = tmp_path / "runners.yaml" + runner_file.write_text(""" +h100: not-a-list +""") + with pytest.raises(ValueError) as exc_info: + load_runner_file(str(runner_file)) + assert "must be a list" in str(exc_info.value) diff --git a/utils/matrix_logic/validation.py b/utils/matrix_logic/validation.py new file mode 100644 index 000000000..4d79d27a9 --- /dev/null +++ b/utils/matrix_logic/validation.py @@ -0,0 +1,438 @@ +from pydantic import BaseModel, Field, ValidationError, ConfigDict, model_validator +from typing import List, Optional, Union, Literal +from enum import Enum + +import pprint +import yaml + +""" + The below class defines the field names expected to be present in the JSON entries + for both single-node and multi-node configurations. 
+""" + + +class Fields(Enum): + # Field name constants + # Top-level config fields + IMAGE = 'image' + MODEL = 'model' + MODEL_PREFIX = 'model-prefix' + PRECISION = 'precision' + FRAMEWORK = 'framework' + RUNNER = 'runner' + SEQ_LEN_CONFIGS = 'seq-len-configs' + MULTINODE = 'multinode' + + # Seq-len-config fields + ISL = 'isl' + OSL = 'osl' + SEARCH_SPACE = 'search-space' + + # Search-space/benchmark fields + TP = 'tp' + CONC_START = 'conc-start' + CONC_END = 'conc-end' + CONC_LIST = 'conc-list' + EP = 'ep' + DP_ATTN = 'dp-attn' + + # Multinode-specific fields (when MULTINODE = true) + SPEC_DECODING = 'spec-decoding' + PREFILL = 'prefill' + DECODE = 'decode' + NUM_WORKER = 'num-worker' + BATCH_SIZE = 'batch-size' + MAX_NUM_TOKENS = 'max-num-tokens' + ADDITIONAL_SETTINGS = 'additional-settings' + + # Matrix entry fields + CONC = 'conc' + MAX_MODEL_LEN = 'max-model-len' + EXP_NAME = 'exp-name' + DISAGG = 'disagg' + + # Eval + FIELD_RUN_EVAL = 'run-eval' + + +""" + Below is the validation logic for the OUTPUT of utils/matrix_logic/generate_sweep_configs.py, i.e., + the input to the actual workflow files. The validation enforces a strict set of rules on the structure + of the generated matrix entries to ensure correctness before proceeding with benchmarking. This ensures + that no validation has to happen in the workflow itself, i.e., at runtime, it is assumed that all inputs + are valid. Threfore, there should not be any default values set in these Pydantic models. Any missing value + should raise a validation error. +""" + + +class SingleNodeMatrixEntry(BaseModel): + """Pydantic model for validating single node matrix entry structure. + This validates the input that should be expected to .github/workflows/benchmark-tmpl.yml""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + image: str + model: str + model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value) + precision: str + framework: str + spec_decoding: Literal["mtp", "draft_model", "none"] = Field( + alias=Fields.SPEC_DECODING.value + ) + runner: str + isl: int + osl: int + tp: int + ep: int + dp_attn: bool = Field(alias=Fields.DP_ATTN.value) + conc: Union[int, List[int]] + max_model_len: int = Field(alias=Fields.MAX_MODEL_LEN.value) + exp_name: str = Field(alias=Fields.EXP_NAME.value) + disagg: bool + run_eval: bool = Field(alias='run-eval', default=False) + + +class WorkerConfig(BaseModel): + """Pydantic model for validating worker configuration in multinode entries.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + num_worker: int = Field(alias=Fields.NUM_WORKER.value) + tp: int + ep: int + dp_attn: bool = Field(alias=Fields.DP_ATTN.value) + additional_settings: Optional[List[str]] = Field( + default=[], alias=Fields.ADDITIONAL_SETTINGS.value) + + +class MultiNodeMatrixEntry(BaseModel): + """Pydantic model for validating multinode matrix entry structure. 
+    This validates the input expected by .github/workflows/benchmark-multinode-tmpl.yml"""
+    model_config = ConfigDict(extra='forbid', populate_by_name=True)
+
+    image: str
+    model: str
+    model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value)
+    precision: str
+    framework: str
+    spec_decoding: Literal["mtp", "draft_model", "none"] = Field(
+        alias=Fields.SPEC_DECODING.value
+    )
+    runner: str
+    isl: int
+    osl: int
+    prefill: WorkerConfig
+    decode: WorkerConfig
+    conc: List[int]
+    max_model_len: int = Field(alias=Fields.MAX_MODEL_LEN.value)
+    exp_name: str = Field(alias=Fields.EXP_NAME.value)
+    disagg: bool
+    run_eval: bool = Field(alias='run-eval', default=False)
+
+
+def validate_matrix_entry(entry: dict, is_multinode: bool) -> dict:
+    """Validate that a matrix entry matches the expected structure.
+
+    Raises ValueError if the entry fails validation.
+    Returns the original entry if it is valid.
+    """
+    try:
+        if is_multinode:
+            MultiNodeMatrixEntry(**entry)
+        else:
+            SingleNodeMatrixEntry(**entry)
+    except ValidationError as e:
+        raise ValueError(
+            f"The following parsed matrix entry failed validation:\n{pprint.pformat(entry)}\n{e}")
+    return entry
+
+
+"""
+    Below is the validation logic for the INPUT to utils/matrix_logic/generate_sweep_configs.py, i.e.,
+    the master configuration files found in .github/configs. The validation enforces a strict set of
+    rules on the structure of the master configuration files to ensure correctness before proceeding
+    with matrix generation.
+"""
+
+
+def _validate_conc_fields(self):
+    """Ensure either (conc_start AND conc_end) OR conc_list is provided, but not both."""
+    has_range = self.conc_start is not None and self.conc_end is not None
+    has_list = self.conc_list is not None and len(self.conc_list) > 0
+
+    if has_range and has_list:
+        raise ValueError(
+            f"Cannot specify both '{Fields.CONC_LIST.value}' list and "
+            f"'{Fields.CONC_START.value}'/'{Fields.CONC_END.value}'. "
+            "Use either a list or a range, not both."
+        )
+
+    if not has_range and not has_list:
+        raise ValueError(
+            f"Must specify either '{Fields.CONC_LIST.value}' list or both "
+            f"'{Fields.CONC_START.value}' and '{Fields.CONC_END.value}'."
+        )
+
+    if has_range:
+        if self.conc_start is None or self.conc_end is None:
+            raise ValueError(
+                f"Both '{Fields.CONC_START.value}' and '{Fields.CONC_END.value}' "
+                "must be provided together."
+            )
+
+        if self.conc_start > self.conc_end:
+            raise ValueError(
+                f"'{Fields.CONC_START.value}' ({self.conc_start}) must be <= "
+                f"'{Fields.CONC_END.value}' ({self.conc_end})."
+            )
+
+    if has_list:
+        if not all(x > 0 for x in self.conc_list):
+            raise ValueError(
+                f"Input '{Fields.CONC_LIST.value}' entries must be greater than 0."
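+                # For example, conc-list=[4, 0, 16] trips this branch (see
+                # test_conc_list_values_must_be_positive), while a range such
+                # as conc-start=4, conc-end=64 is handled by the checks above.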
+            )
+
+    return self
+
+
+class SingleNodeSearchSpaceEntry(BaseModel):
+    """Single node search space configuration."""
+    model_config = ConfigDict(extra='forbid', populate_by_name=True)
+
+    tp: int
+    ep: Optional[int] = None
+    spec_decoding: Literal["mtp", "draft_model", "none"] = Field(
+        default="none", alias=Fields.SPEC_DECODING.value)
+    dp_attn: Optional[bool] = Field(
+        default=None, alias=Fields.DP_ATTN.value)
+    conc_start: Optional[int] = Field(
+        default=None, alias=Fields.CONC_START.value)
+    conc_end: Optional[int] = Field(
+        default=None, alias=Fields.CONC_END.value)
+    conc_list: Optional[List[int]] = Field(
+        default=None, alias=Fields.CONC_LIST.value)
+
+    @model_validator(mode='after')
+    def validate_conc_fields(self):
+        return _validate_conc_fields(self)
+
+
+class MultiNodeSearchSpaceEntry(BaseModel):
+    """Multinode search space configuration."""
+    model_config = ConfigDict(extra='forbid', populate_by_name=True)
+
+    spec_decoding: Literal["mtp", "draft_model", "none"] = Field(
+        default="none", alias=Fields.SPEC_DECODING.value)
+    prefill: WorkerConfig
+    decode: WorkerConfig
+    conc_start: Optional[int] = Field(
+        default=None, alias=Fields.CONC_START.value)
+    conc_end: Optional[int] = Field(
+        default=None, alias=Fields.CONC_END.value)
+    conc_list: Optional[List[int]] = Field(
+        default=None, alias=Fields.CONC_LIST.value)
+
+    @model_validator(mode='after')
+    def validate_conc_fields(self):
+        return _validate_conc_fields(self)
+
+
+class SingleNodeSeqLenConfig(BaseModel):
+    """Single node sequence length configuration."""
+    model_config = ConfigDict(extra='forbid', populate_by_name=True)
+
+    isl: int
+    osl: int
+    search_space: List[SingleNodeSearchSpaceEntry] = Field(
+        alias=Fields.SEARCH_SPACE.value)
+
+
+class MultiNodeSeqLenConfig(BaseModel):
+    """Multinode sequence length configuration."""
+    model_config = ConfigDict(extra='forbid', populate_by_name=True)
+
+    isl: int
+    osl: int
+    search_space: List[MultiNodeSearchSpaceEntry] = Field(
+        alias=Fields.SEARCH_SPACE.value)
+
+
+class SingleNodeMasterConfigEntry(BaseModel):
+    """Top-level single node master configuration entry."""
+    model_config = ConfigDict(extra='forbid', populate_by_name=True)
+
+    image: str
+    model: str
+    model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value)
+    precision: str
+    framework: str
+    runner: str
+    multinode: Literal[False]
+    disagg: bool = Field(default=False)
+    seq_len_configs: List[SingleNodeSeqLenConfig] = Field(
+        alias=Fields.SEQ_LEN_CONFIGS.value)
+
+
+class MultiNodeMasterConfigEntry(BaseModel):
+    """Top-level multinode master configuration entry."""
+    model_config = ConfigDict(extra='forbid', populate_by_name=True)
+
+    image: str
+    model: str
+    model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value)
+    precision: str
+    framework: str
+    runner: str
+    multinode: Literal[True]
+    disagg: bool = Field(default=False)
+    seq_len_configs: List[MultiNodeSeqLenConfig] = Field(
+        alias=Fields.SEQ_LEN_CONFIGS.value)
+
+
+def validate_master_config(master_configs: dict) -> dict:
+    """Validate input master configuration structure."""
+    for key, entry in master_configs.items():
+        is_multinode = entry.get('multinode', False)
+
+        try:
+            if is_multinode:
+                MultiNodeMasterConfigEntry(**entry)
+            else:
+                SingleNodeMasterConfigEntry(**entry)
+        except ValidationError as e:
+            raise ValueError(
+                f"Master config entry '{key}' failed validation:\n{e}")
+    return master_configs
+
+# Runner Config Validation
+
+
+def validate_runner_config(runner_configs: dict) -> dict:
+    """Validate runner 
configuration structure.""" + for key, value in runner_configs.items(): + if not isinstance(value, list): + raise ValueError( + f"Runner config entry '{key}' must be a list, got {type(value).__name__}") + + if not all(isinstance(item, str) for item in value): + raise ValueError( + f"Runner config entry '{key}' must contain only strings") + + if not value: + raise ValueError( + f"Runner config entry '{key}' cannot be an empty list") + + return runner_configs + + +""" + Below is the validation logic for the changelog entries found in perf-changelog.yaml. + This ensures that the changelog entries conform to the expected structure before + proceeding with processing. +""" + + +class ChangelogEntry(BaseModel): + """Pydantic model for validating changelog entry structure.""" + model_config = ConfigDict(extra="forbid", populate_by_name=True) + + config_keys: list[str] = Field(alias="config-keys", min_length=1) + description: str + + +class ChangelogMetadata(BaseModel): + """Pydantic model for validating changelog metadata structure.""" + model_config = ConfigDict(extra="forbid") + + base_ref: str + head_ref: str + entries: list[ChangelogEntry] + + +class ChangelogMatrixEntry(BaseModel): + """Pydantic model for validating final changelog matrix entry structure. + This imposes a strict contract on the output of process_changelog.py, dictated by + the expected input to the run-sweep.yml workflow file. + """ + model_config = ConfigDict(extra="forbid", populate_by_name=True) + + single_node: dict[str, list[SingleNodeMatrixEntry] + ] = Field(default_factory=dict) + multi_node: dict[str, list[MultiNodeMatrixEntry] + ] = Field(default_factory=dict) + changelog_metadata: ChangelogMetadata + + +# ============================================================================= +# File Loading Functions +# ============================================================================= + + +def load_config_files(config_files: List[str], validate: bool = True) -> dict: + """Load and merge configuration files. + + Args: + config_files: List of paths to YAML configuration files. + validate: If True, run validate_master_config on loaded data. Defaults to True. + + Returns: + Merged configuration dictionary. + + Raises: + ValueError: If file doesn't exist, isn't a dict, or has duplicate keys. + """ + all_config_data = {} + for config_file in config_files: + try: + with open(config_file, 'r') as f: + config_data = yaml.safe_load(f) + assert isinstance( + config_data, dict), f"Config file '{config_file}' must contain a dictionary" + + # Don't allow '*' wildcard in master config keys as we need to reserve these + # for expansion in process_changelog.py + for key in config_data.keys(): + if "*" in key: + raise ValueError( + f" Wildcard '*' is not allowed in master config keys: '{key}'") + + # Check for duplicate keys + duplicate_keys = set(all_config_data.keys()) & set( + config_data.keys()) + if duplicate_keys: + raise ValueError( + f"Duplicate configuration keys found in '{config_file}': {', '.join(sorted(duplicate_keys))}" + ) + + all_config_data.update(config_data) + except FileNotFoundError: + raise ValueError(f"Input file '{config_file}' does not exist.") + + if validate: + validate_master_config(all_config_data) + + return all_config_data + + +def load_runner_file(runner_file: str, validate: bool = True) -> dict: + """Load runner configuration file. + + Args: + runner_file: Path to the runner YAML configuration file. + validate: If True, run validate_runner_config on loaded data. Defaults to True. 
+ + Returns: + Runner configuration dictionary. + + Raises: + ValueError: If file doesn't exist or fails validation. + """ + try: + with open(runner_file, 'r') as f: + runner_config = yaml.safe_load(f) + except FileNotFoundError: + raise ValueError( + f"Runner config file '{runner_file}' does not exist.") + + if validate: + validate_runner_config(runner_config) + + return runner_config From 9d4b2179486845c77dd496178812f85be293f817 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 15 Dec 2025 11:55:36 -0600 Subject: [PATCH 177/214] pt 2 manual merge conflict fixes --- .github/workflows/run-sweep.yml | 1 + perf-changelog.yaml | 4 ++++ utils/matrix_logic/generate_sweep_configs.py | 2 +- utils/process_changelog.py | 1 + 4 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/run-sweep.yml b/.github/workflows/run-sweep.yml index e449942d1..6a459de40 100644 --- a/.github/workflows/run-sweep.yml +++ b/.github/workflows/run-sweep.yml @@ -142,6 +142,7 @@ jobs: conc: ${{ matrix.config.conc }} spec-decoding: ${{ matrix.config.spec-decoding }} disagg: ${{ matrix.config.disagg }} + run-eval: ${{ matrix.config.run-eval }} sweep-single-node-1k8k: needs: setup diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 112145f10..6098f931f 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -81,3 +81,7 @@ - Update vLLM image for NVIDIA configs from vLLM 0.11.0 to vLLM 0.11.2 - Adds kv-cache-dtype: fp8 to benchmarks/gptoss_fp4_b200_docker.sh PR: https://github.com/InferenceMAX/InferenceMAX/pull/273 +- config-keys: + - gptoss-fp4-mi300x-vllm + description: | + - Test evals diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py index cba89c448..7ca0f2996 100644 --- a/utils/matrix_logic/generate_sweep_configs.py +++ b/utils/matrix_logic/generate_sweep_configs.py @@ -571,7 +571,7 @@ def main(): '--run-evals', action='store_true', required=False, - help='When specifiedm run evals on a subset of configs.' + help='When specified, run evals on a subset of configs.' 
) # Create main parser diff --git a/utils/process_changelog.py b/utils/process_changelog.py index 4a856c9a8..fc40baaf4 100644 --- a/utils/process_changelog.py +++ b/utils/process_changelog.py @@ -115,6 +115,7 @@ def main(): *MASTER_CONFIGS, "--runner-config", RUNNER_CONFIG, + "--run-evals" ], capture_output=True, text=True, From a9fad5b2014e5c3bb286a01ca7d894c418ecb9d7 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 15 Dec 2025 13:04:55 -0600 Subject: [PATCH 178/214] use double quotes for gha parsing --- .github/workflows/benchmark-tmpl.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 3379e9d15..e10a040bc 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -84,7 +84,7 @@ jobs: benchmark: runs-on: ${{ inputs.runner }} timeout-minutes: 180 - name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.framework }} ${{ inputs.precision }} ${{ inputs.run-eval && ''eval '' || '''' }}tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} conc=${{ inputs.conc }}' + name: "${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.framework }} ${{ inputs.precision }} ${{ inputs.run-eval && 'eval ' || '' }}tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} conc=${{ inputs.conc }}" steps: - name: Resource cleanup run: | From e07eb697aa94d7f1d6d12d787f77d6bf00318867 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 15 Dec 2025 13:05:52 -0600 Subject: [PATCH 179/214] getting rid of full sweep sched changes --- .github/workflows/full-sweep-1k1k-scheduler.yml | 12 ++++++++---- .github/workflows/full-sweep-1k8k-scheduler.yml | 12 ++++++++---- .github/workflows/full-sweep-8k1k-scheduler.yml | 12 ++++++++---- 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/.github/workflows/full-sweep-1k1k-scheduler.yml b/.github/workflows/full-sweep-1k1k-scheduler.yml index 945adeaa3..a8b40214e 100644 --- a/.github/workflows/full-sweep-1k1k-scheduler.yml +++ b/.github/workflows/full-sweep-1k1k-scheduler.yml @@ -16,8 +16,10 @@ jobs: - id: get-dsr1-configs run: | pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1 --run-evals) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT + echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT get-gptoss-configs: runs-on: ubuntu-latest @@ -31,8 +33,10 @@ jobs: - id: get-gptoss-configs run: | pip install pydantic - CONFIG_JSON=$(python3 
${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss --run-evals) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k1k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k1k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT + echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT benchmark-dsr1-multi-node: needs: get-dsr1-configs diff --git a/.github/workflows/full-sweep-1k8k-scheduler.yml b/.github/workflows/full-sweep-1k8k-scheduler.yml index 10b137c88..062f00265 100644 --- a/.github/workflows/full-sweep-1k8k-scheduler.yml +++ b/.github/workflows/full-sweep-1k8k-scheduler.yml @@ -16,8 +16,10 @@ jobs: - id: get-dsr1-configs run: | pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1 --run-evals) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k8k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k8k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT + echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT get-gptoss-configs: runs-on: ubuntu-latest @@ -31,8 +33,10 @@ jobs: - id: get-gptoss-configs run: | pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss --run-evals) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k8k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config 
${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k8k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT + echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT benchmark-dsr1-multi-node: needs: get-dsr1-configs diff --git a/.github/workflows/full-sweep-8k1k-scheduler.yml b/.github/workflows/full-sweep-8k1k-scheduler.yml index d9fd2fd77..2b45b9679 100644 --- a/.github/workflows/full-sweep-8k1k-scheduler.yml +++ b/.github/workflows/full-sweep-8k1k-scheduler.yml @@ -16,8 +16,10 @@ jobs: - id: get-dsr1-configs run: | pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1 --run-evals) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 8k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 8k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT + echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT get-gptoss-configs: runs-on: ubuntu-latest @@ -31,8 +33,10 @@ jobs: - id: get-gptoss-configs run: | pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss --run-evals) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 8k1k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 8k1k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT + echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT benchmark-dsr1-multi-node: needs: get-dsr1-configs From 
9275f0d8aedd1dc1fa4f281c816b1923c5e836b7 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 15 Dec 2025 13:13:54 -0600 Subject: [PATCH 180/214] add back spec decoding and disagg env vars --- .github/workflows/benchmark-tmpl.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index e10a040bc..284443961 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -75,6 +75,8 @@ env: EP_SIZE: ${{ inputs.ep }} DP_ATTENTION: ${{ inputs.dp-attn }} CONC: ${{ inputs.conc }} + SPEC_DECODING: ${{ inputs.spec-decoding }} + DISAGG: ${{ inputs.disagg }} RUN_EVAL: ${{ inputs.run-eval }} permissions: From dba25aa5f0a03b9bcde87a00eb6b5661d7603d54 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 16 Dec 2025 10:48:00 -0600 Subject: [PATCH 181/214] add an option to ONLY run evals --- utils/matrix_logic/generate_sweep_configs.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py index 7ca0f2996..b172c2efd 100644 --- a/utils/matrix_logic/generate_sweep_configs.py +++ b/utils/matrix_logic/generate_sweep_configs.py @@ -567,11 +567,16 @@ def main(): required=True, help='Configuration file holding runner information (YAML format)' ) - parent_parser.add_argument( + eval_group = parent_parser.add_mutually_exclusive_group() + eval_group.add_argument( '--run-evals', action='store_true', - required=False, - help='When specified, run evals on a subset of configs.' + help='When specified, run evals on a subset of configs (in addition to all configs).' + ) + eval_group.add_argument( + '--evals-only', + action='store_true', + help='When specified, run ONLY the eval subset (excludes non-eval configs).' 
) # Create main parser @@ -736,9 +741,12 @@ def main(): else: parser.error(f"Unknown command: {args.command}") - # Choose eval (opt-in via --run-evals) - if args.run_evals: + # Handle eval options (mutually exclusive) + if args.run_evals or args.evals_only: matrix_values = mark_eval_entries(matrix_values) + # IF --evals-only is specified, filter to only eval entries + if args.evals_only: + matrix_values = [e for e in matrix_values if e.get(Fields.FIELD_RUN_EVAL.value, False)] print(json.dumps(matrix_values)) return matrix_values From 5de917bcf1f62814d0cdaebf6c059049d5b3d045 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 16 Dec 2025 11:28:46 -0600 Subject: [PATCH 182/214] remove full-sweep-test workflow and add collect-evals job to run sweep and e2e test --- .github/workflows/collect-evals.yml | 12 +- .github/workflows/e2e-tests.yml | 8 +- .github/workflows/full-sweep-test.yml | 503 -------------------------- .github/workflows/run-sweep.yml | 17 + 4 files changed, 30 insertions(+), 510 deletions(-) delete mode 100644 .github/workflows/full-sweep-test.yml diff --git a/.github/workflows/collect-evals.yml b/.github/workflows/collect-evals.yml index c45842ef2..64bf603e8 100644 --- a/.github/workflows/collect-evals.yml +++ b/.github/workflows/collect-evals.yml @@ -3,7 +3,7 @@ name: Template - Collect Evals on: workflow_call: inputs: - exp-name: + result-prefix: required: false type: string default: '' @@ -25,19 +25,19 @@ jobs: uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0 with: path: eval_results/ - pattern: ${{ inputs.exp-name && format('eval_{0}_*', inputs.exp-name) || 'eval_*' }} + pattern: ${{ inputs.result-prefix && format('eval_{0}_*', inputs.result-prefix) || 'eval_*' }} - name: Summarize evals run: | - echo "## Eval Summary - ${{ inputs.exp-name || 'all' }}" >> $GITHUB_STEP_SUMMARY + echo "## Eval Summary - ${{ inputs.result-prefix || 'all' }}" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY - python3 utils/collect_eval_results.py eval_results/ ${{ inputs.exp-name || 'all' }} >> $GITHUB_STEP_SUMMARY + python3 utils/collect_eval_results.py eval_results/ ${{ inputs.result-prefix || 'all' }} >> $GITHUB_STEP_SUMMARY - name: Upload aggregated evals uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 with: - name: eval_results_${{ inputs.exp-name || 'all' }} - path: agg_eval_${{ inputs.exp-name || 'all' }}.json + name: eval_results_${{ inputs.result-prefix || 'all' }} + path: agg_eval_${{ inputs.result-prefix || 'all' }}.json - name: Cleanup downloaded eval artifacts if: ${{ always() }} diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 208f635d8..11057abcd 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -105,8 +105,14 @@ jobs: uses: ./.github/workflows/collect-results.yml secrets: inherit + collect-evals: + needs: [test-sweep-multi-node, test-sweep-single-node] + if: ${{ always() }} + uses: ./.github/workflows/collect-evals.yml + secrets: inherit + calc-success-rate: - needs: collect-results + needs: [collect-results, collect-evals] if: ${{ always() }} runs-on: ubuntu-latest diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml deleted file mode 100644 index 3ba954838..000000000 --- a/.github/workflows/full-sweep-test.yml +++ /dev/null @@ -1,503 +0,0 @@ -name: Test - Full Sweep - -on: - workflow_dispatch: - inputs: - run_1k1k: - type: boolean - required: false - run_8k1k: - type: boolean - required: false - 
run_1k8k: - type: boolean - required: false - use_h100: - type: boolean - required: false - use_h200: - type: boolean - required: false - use_b200: - type: boolean - required: false - use_mi300x: - type: boolean - required: false - use_mi325x: - type: boolean - required: false - use_mi355x: - type: boolean - required: false - use_gb200: - type: boolean - required: false - -jobs: - get-configs: - runs-on: ubuntu-latest - outputs: - dsr1-1k1k: ${{ steps.generate-configs.outputs.dsr1-1k1k }} - dsr1-1k8k: ${{ steps.generate-configs.outputs.dsr1-1k8k }} - dsr1-8k1k: ${{ steps.generate-configs.outputs.dsr1-8k1k }} - gptoss-1k1k: ${{ steps.generate-configs.outputs.gptoss-1k1k }} - gptoss-1k8k: ${{ steps.generate-configs.outputs.gptoss-1k8k }} - gptoss-8k1k: ${{ steps.generate-configs.outputs.gptoss-8k1k }} - steps: - - name: Checkout code - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 - - # This looks complicated, but it is just calling generate_sweep_configs.py conditioned on - # discrete inputs (i.e., run_1k1k, run_h100, etc.) to split the test sweep into discrete jobs - - id: generate-configs - run: | - pip install pydantic - - set -x - # Build runner type filters based on inputs - RUNNER_TYPES="${{ inputs.use_h100 && 'h100' || '' }} ${{ inputs.use_h200 && 'h200' || '' }} ${{ inputs.use_b200 && 'b200 b200-trt' || '' }} ${{ inputs.use_mi300x && 'mi300x' || '' }} ${{ inputs.use_mi325x && 'mi325x' || '' }} ${{ inputs.use_mi355x && 'mi355x' || '' }}" - - # DSR1 doesn't support H100, so exclude it - DSR1_RUNNER_TYPES=$(echo $RUNNER_TYPES | sed 's/\bh100\b//g' | xargs) - - # Generate dsr1 configs (only if we have valid runner types for DSR1) - if [ "${{ inputs.run_1k1k }}" = "true" ] && [ -n "$DSR1_RUNNER_TYPES" ]; then - DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml --run-evals) - echo "dsr1-1k1k=$DSR1_1K1K" >> $GITHUB_OUTPUT - else - echo "dsr1-1k1k=[]" >> $GITHUB_OUTPUT - fi - - if [ "${{ inputs.run_1k8k }}" = "true" ] && [ -n "$DSR1_RUNNER_TYPES" ]; then - DSR1_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml --run-evals) - echo "dsr1-1k8k=$DSR1_1K8K" >> $GITHUB_OUTPUT - else - echo "dsr1-1k8k=[]" >> $GITHUB_OUTPUT - fi - - if [ "${{ inputs.run_8k1k }}" = "true" ] && [ -n "$DSR1_RUNNER_TYPES" ]; then - DSR1_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml --run-evals) - echo "dsr1-8k1k=$DSR1_8K1K" >> $GITHUB_OUTPUT - else - echo "dsr1-8k1k=[]" >> $GITHUB_OUTPUT - fi - - # Generate gptoss configs (only if we have runner types selected) - if [ "${{ inputs.run_1k1k }}" = "true" ] && [ -n "$RUNNER_TYPES" ]; then - GPTOSS_1K1K=$(python3 
${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml --run-evals) - echo "gptoss-1k1k=$GPTOSS_1K1K" >> $GITHUB_OUTPUT - else - echo "gptoss-1k1k=[]" >> $GITHUB_OUTPUT - fi - - if [ "${{ inputs.run_1k8k }}" = "true" ] && [ -n "$RUNNER_TYPES" ]; then - GPTOSS_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml --run-evals) - echo "gptoss-1k8k=$GPTOSS_1K8K" >> $GITHUB_OUTPUT - else - echo "gptoss-1k8k=[]" >> $GITHUB_OUTPUT - fi - - if [ "${{ inputs.run_8k1k }}" = "true" ] && [ -n "$RUNNER_TYPES" ]; then - GPTOSS_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml --run-evals) - echo "gptoss-8k1k=$GPTOSS_8K1K" >> $GITHUB_OUTPUT - else - echo "gptoss-8k1k=[]" >> $GITHUB_OUTPUT - fi - - # DSR1 1K1K Benchmarks - benchmark-dsr1-1k1k: - needs: get-configs - if: ${{ needs.get-configs.outputs.dsr1-1k1k != '[]' }} - uses: ./.github/workflows/benchmark-tmpl.yml - name: dsr1 1k1k / - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-configs.outputs.dsr1-1k1k) }} - secrets: inherit - with: - exp-name: ${{ matrix.config.exp-name }} - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - run-eval: ${{ matrix.config.run-eval }} - - collect-dsr1-1k1k-results: - needs: benchmark-dsr1-1k1k - if: ${{ always() && needs.get-configs.outputs.dsr1-1k1k != '[]' }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "dsr1_1k1k" - - collect-dsr1-1k1k-evals: - needs: benchmark-dsr1-1k1k - if: ${{ always() && needs.get-configs.outputs.dsr1-1k1k != '[]' }} - uses: ./.github/workflows/collect-evals.yml - secrets: inherit - with: - exp-name: "dsr1_1k1k" - - # GPTOSS 1K1K Benchmarks - benchmark-gptoss-1k1k: - needs: get-configs - if: ${{ needs.get-configs.outputs.gptoss-1k1k != '[]' }} - uses: ./.github/workflows/benchmark-tmpl.yml - name: gptoss 1k1k / - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-configs.outputs.gptoss-1k1k) }} - secrets: inherit - with: - exp-name: ${{ matrix.config.exp-name }} - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: 
${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - run-eval: ${{ matrix.config.run-eval }} - - collect-gptoss-1k1k-results: - needs: benchmark-gptoss-1k1k - if: ${{ always() && needs.get-configs.outputs.gptoss-1k1k != '[]' }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "gptoss_1k1k" - - collect-gptoss-1k1k-evals: - needs: benchmark-gptoss-1k1k - if: ${{ always() && needs.get-configs.outputs.gptoss-1k1k != '[]' }} - uses: ./.github/workflows/collect-evals.yml - secrets: inherit - with: - exp-name: "gptoss_1k1k" - - - # DSR1 8K1K Benchmarks - benchmark-dsr1-8k1k: - needs: get-configs - if: ${{ needs.get-configs.outputs.dsr1-8k1k != '[]' }} - uses: ./.github/workflows/benchmark-tmpl.yml - name: dsr1 8k1k / - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-configs.outputs.dsr1-8k1k) }} - secrets: inherit - with: - exp-name: ${{ matrix.config.exp-name }} - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - run-eval: ${{ matrix.config.run-eval }} - - collect-dsr1-8k1k-results: - needs: benchmark-dsr1-8k1k - if: ${{ always() && needs.get-configs.outputs.dsr1-8k1k != '[]' }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "dsr1_8k1k" - - collect-dsr1-8k1k-evals: - needs: benchmark-dsr1-8k1k - if: ${{ always() && needs.get-configs.outputs.dsr1-8k1k != '[]' }} - uses: ./.github/workflows/collect-evals.yml - secrets: inherit - with: - exp-name: "dsr1_8k1k" - - # GPTOSS 8K1K Benchmarks - benchmark-gptoss-8k1k: - needs: get-configs - if: ${{ needs.get-configs.outputs.gptoss-8k1k != '[]' }} - uses: ./.github/workflows/benchmark-tmpl.yml - name: gptoss 8k1k / - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-configs.outputs.gptoss-8k1k) }} - secrets: inherit - with: - exp-name: ${{ matrix.config.exp-name }} - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - run-eval: ${{ matrix.config.run-eval }} - - collect-gptoss-8k1k-results: - needs: benchmark-gptoss-8k1k - if: ${{ always() && needs.get-configs.outputs.gptoss-8k1k != '[]' }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "gptoss_8k1k" - - collect-gptoss-8k1k-evals: - needs: benchmark-gptoss-8k1k - if: ${{ always() && needs.get-configs.outputs.gptoss-8k1k != '[]' }} - uses: ./.github/workflows/collect-evals.yml - secrets: inherit - with: - exp-name: "gptoss_8k1k" - - - # DSR1 1K8K Benchmarks - benchmark-dsr1-1k8k: - needs: get-configs - if: ${{ needs.get-configs.outputs.dsr1-1k8k != '[]' }} - uses: ./.github/workflows/benchmark-tmpl.yml - name: dsr1 1k8k / - strategy: - fail-fast: false - matrix: - config: ${{ 
fromJson(needs.get-configs.outputs.dsr1-1k8k) }} - secrets: inherit - with: - exp-name: ${{ matrix.config.exp-name }} - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - run-eval: ${{ matrix.config.run-eval }} - - # This is a workaround until we can integrate GB200 into master configs. - benchmark-gb200-1k1k: - if: ${{ inputs.use_gb200 && inputs.run_1k1k }} - uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: gb200 1k1k sweep - strategy: - fail-fast: false - matrix: - config: &dsr1_static_configs - - { - "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", - "model": "deepseek-r1-fp4", - "model-prefix": "dsr1", - "precision": "fp4", - "framework": "dynamo-trtllm", - "mtp": "off", - } - - { - "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", - "model": "deepseek-r1-fp4", - "model-prefix": "dsr1", - "precision": "fp4", - "framework": "dynamo-trtllm", - "mtp": "on", - } - - { - "image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", - "model": "deepseek-ai/DeepSeek-R1-0528", - "model-prefix": "dsr1", - "precision": "fp8", - "framework": "dynamo-sglang", - "mtp": "off", - } - secrets: inherit - with: - runner: gb200 - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.model-prefix }}_1k1k - isl: 1024 - osl: 1024 - max-model-len: 2048 - mtp-mode: ${{ matrix.config.mtp }} - - benchmark-gb200-1k8k: - if: ${{ inputs.use_gb200 && inputs.run_1k8k }} - uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: gb200 1k8k sweep - strategy: - fail-fast: false - matrix: - config: *dsr1_static_configs - secrets: inherit - with: - runner: gb200 - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.model-prefix }}_1k8k - isl: 1024 - osl: 8192 - max-model-len: 9216 - mtp-mode: ${{ matrix.config.mtp }} - - benchmark-gb200-8k1k: - if: ${{ inputs.use_gb200 && inputs.run_8k1k }} - uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: gb200 8k1k sweep - strategy: - fail-fast: false - matrix: - config: *dsr1_static_configs - secrets: inherit - with: - runner: gb200 - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.model-prefix }}_8k1k - isl: 1024 - osl: 8192 - max-model-len: 9216 - mtp-mode: ${{ matrix.config.mtp }} - - collect-dsr1-1k8k-results: - needs: - [ - benchmark-dsr1-1k8k, - benchmark-gb200-1k1k, - benchmark-gb200-1k8k, - benchmark-gb200-8k1k, - ] - if: ${{ always() && needs.get-configs.outputs.dsr1-1k8k != '[]' }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "dsr1_1k8k" - - collect-dsr1-1k8k-evals: - needs: benchmark-dsr1-1k8k - if: ${{ always() && needs.get-configs.outputs.dsr1-1k8k != '[]' }} - uses: ./.github/workflows/collect-evals.yml - secrets: inherit - with: - exp-name: "dsr1_1k8k" - 
- - # GPTOSS 1K8K Benchmarks - benchmark-gptoss-1k8k: - needs: get-configs - if: ${{ needs.get-configs.outputs.gptoss-1k8k != '[]' }} - uses: ./.github/workflows/benchmark-tmpl.yml - name: gptoss 1k8k / - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-configs.outputs.gptoss-1k8k) }} - secrets: inherit - with: - exp-name: ${{ matrix.config.exp-name }} - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - run-eval: ${{ matrix.config.run-eval }} - - collect-gptoss-1k8k-results: - needs: benchmark-gptoss-1k8k - if: ${{ always() && needs.get-configs.outputs.gptoss-1k8k != '[]' }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "gptoss_1k8k" - - collect-gptoss-1k8k-evals: - needs: benchmark-gptoss-1k8k - if: ${{ always() && needs.get-configs.outputs.gptoss-1k8k != '[]' }} - uses: ./.github/workflows/collect-evals.yml - secrets: inherit - with: - exp-name: "gptoss_1k8k" - - - calc-success-rate: - needs: - [ - collect-dsr1-1k1k-results, - collect-dsr1-1k8k-results, - collect-dsr1-8k1k-results, - collect-gptoss-1k1k-results, - collect-gptoss-1k8k-results, - collect-gptoss-8k1k-results, - ] - if: ${{ always() }} - runs-on: ubuntu-latest - - env: - RESULTS_DIR: "results/" - STATS_FILENAME: "run_stats" - GITHUB_TOKEN: ${{ secrets.REPO_PAT }} - - steps: - - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 - with: - token: ${{ secrets.REPO_PAT }} - fetch-depth: 0 - - - name: Download results artifacts - uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0 - with: - path: ${{ env.RESULTS_DIR }} - pattern: results_* - - - name: Install python dependencies - run: pip install PyGithub - - - name: Calculate success rate - run: python3 utils/calc_success_rate.py $STATS_FILENAME - - - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 - with: - name: "run-stats" - path: ${{ env.STATS_FILENAME }}.json diff --git a/.github/workflows/run-sweep.yml b/.github/workflows/run-sweep.yml index 6a459de40..adcfd3dbf 100644 --- a/.github/workflows/run-sweep.yml +++ b/.github/workflows/run-sweep.yml @@ -185,6 +185,23 @@ jobs: with: result-prefix: "bmk" + collect-evals: + needs: + [ + sweep-single-node-1k1k, + sweep-single-node-1k8k, + sweep-single-node-8k1k, + sweep-multi-node-1k1k, + sweep-multi-node-1k8k, + sweep-multi-node-8k1k, + setup, + ] + if: ${{ always() && needs.setup.result != 'skipped' }} + uses: ./.github/workflows/collect-evals.yml + secrets: inherit + with: + result-prefix: "bmk" + upload-changelog-metadata: needs: [setup, collect-results] if: ${{ always() && needs.setup.result != 'skipped' }} From 37d05d32d4c302bac0a4a8b0f843bc76feaacc52 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 16 Dec 2025 11:53:52 -0600 Subject: [PATCH 183/214] add run-eval to e2e tests --- .github/workflows/benchmark-tmpl.yml | 7 +++---- .github/workflows/e2e-tests.yml | 1 + utils/matrix_logic/generate_sweep_configs.py | 4 ++-- utils/matrix_logic/validation.py | 6 +++--- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/benchmark-tmpl.yml 
b/.github/workflows/benchmark-tmpl.yml index 284443961..08a4b5fef 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -50,14 +50,13 @@ on: disagg: required: true type: string + run-eval: + type: boolean + required: true random-range-ratio: required: false type: string default: '0.8' - run-eval: - type: boolean - required: false - default: false env: HF_TOKEN: ${{ secrets.HF_TOKEN }} HF_HUB_CACHE: '/mnt/hf_hub_cache/' diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 11057abcd..1fca38d1b 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -98,6 +98,7 @@ jobs: conc: ${{ matrix.config.conc }} spec-decoding: ${{ matrix.config.spec-decoding }} disagg: ${{ matrix.config.disagg }} + run-eval: ${{ matrix.config.run-eval }} collect-results: needs: [test-sweep-multi-node, test-sweep-single-node] diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py index b172c2efd..db6826079 100644 --- a/utils/matrix_logic/generate_sweep_configs.py +++ b/utils/matrix_logic/generate_sweep_configs.py @@ -83,7 +83,7 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: # Mark the selected entries for i, entry in enumerate(matrix_values): - entry[Fields.FIELD_RUN_EVAL.value] = i in eval_indices + entry[Fields.RUN_EVAL.value] = i in eval_indices return matrix_values @@ -746,7 +746,7 @@ def main(): matrix_values = mark_eval_entries(matrix_values) # IF --evals-only is specified, filter to only eval entries if args.evals_only: - matrix_values = [e for e in matrix_values if e.get(Fields.FIELD_RUN_EVAL.value, False)] + matrix_values = [e for e in matrix_values if e.get(Fields.RUN_EVAL.value, False)] print(json.dumps(matrix_values)) return matrix_values diff --git a/utils/matrix_logic/validation.py b/utils/matrix_logic/validation.py index 4d79d27a9..424763914 100644 --- a/utils/matrix_logic/validation.py +++ b/utils/matrix_logic/validation.py @@ -52,7 +52,7 @@ class Fields(Enum): DISAGG = 'disagg' # Eval - FIELD_RUN_EVAL = 'run-eval' + RUN_EVAL = 'run-eval' """ @@ -88,7 +88,7 @@ class SingleNodeMatrixEntry(BaseModel): max_model_len: int = Field(alias=Fields.MAX_MODEL_LEN.value) exp_name: str = Field(alias=Fields.EXP_NAME.value) disagg: bool - run_eval: bool = Field(alias='run-eval', default=False) + run_eval: bool = Field(alias=Fields.RUN_EVAL.value, default=False) class WorkerConfig(BaseModel): @@ -125,7 +125,7 @@ class MultiNodeMatrixEntry(BaseModel): max_model_len: int = Field(alias=Fields.MAX_MODEL_LEN.value) exp_name: str = Field(alias=Fields.EXP_NAME.value) disagg: bool - run_eval: bool = Field(alias='run-eval', default=False) + run_eval: bool = Field(alias=Fields.RUN_EVAL.value, default=False) def validate_matrix_entry(entry: dict, is_multinode: bool) -> dict: From 6a546e531cc3fb9d866f1cb14ca752290894cf0a Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 16 Dec 2025 10:01:11 -0800 Subject: [PATCH 184/214] math500 prompt and h200 trt evals --- benchmarks/gptoss_fp4_h200_trt_slurm.sh | 7 +++++++ utils/evals/math500.yaml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/benchmarks/gptoss_fp4_h200_trt_slurm.sh b/benchmarks/gptoss_fp4_h200_trt_slurm.sh index 12a6af5b7..26043d322 100644 --- a/benchmarks/gptoss_fp4_h200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_trt_slurm.sh @@ -74,3 +74,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After 
throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 )) + append_lm_eval_summary +fi +set +x \ No newline at end of file diff --git a/utils/evals/math500.yaml b/utils/evals/math500.yaml index 09051d118..2e172e7f0 100644 --- a/utils/evals/math500.yaml +++ b/utils/evals/math500.yaml @@ -9,7 +9,7 @@ dataset_name: algebra output_type: generate_until training_split: train test_split: test -doc_to_text: "You are solving competition math problems.\n\nFormat rules:\n- Answer in a new. That line must start with `Answer: ` (capital A, colon, one space).\n- After `Answer: `, write ONLY the answer as inline LaTeX.\n- Use ONLY ASCII LaTeX commands (e.g. \\pi, \\frac{1}{2}, -). NO Unicode symbols.\n- Do NOT wrap the answer in $, $$, \\( \\), \\[ \\], or any other delimiters.\n- Do NOT use \\displaystyle or any display-style commands. Answer only this problem, the rest are examples. Problem: {{problem}}\n" +doc_to_text: "You are solving competition math problems.\n\nFormat rules:\n- Answer in a new line that starts with `Answer: `.\n- After `Answer: `, write ONLY the answer as inline LaTeX.\n- Use ONLY ASCII LaTeX commands (e.g. \\pi, \\frac{1}{2}, -). NO Unicode symbols.\n- Do NOT wrap the answer in $, $$, \\( \\), \\[ \\], or any other delimiters.\n- Do NOT use \\displaystyle or any display-style commands. Answer only this problem, the rest are examples. Problem: {{problem}}\n" process_results: !function utils.process_results doc_to_target: "{{answer}}" generation_kwargs: From d299d417de828c09d272fce529f2df8dd5aaffef Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 16 Dec 2025 13:14:57 -0600 Subject: [PATCH 185/214] remove run prefix --- .github/workflows/e2e-tests.yml | 2 ++ .github/workflows/run-sweep.yml | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 1fca38d1b..7e128253d 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -105,6 +105,8 @@ jobs: if: ${{ always() }} uses: ./.github/workflows/collect-results.yml secrets: inherit + with: + result-prefix: "bmk" collect-evals: needs: [test-sweep-multi-node, test-sweep-single-node] diff --git a/.github/workflows/run-sweep.yml b/.github/workflows/run-sweep.yml index adcfd3dbf..224bae7f9 100644 --- a/.github/workflows/run-sweep.yml +++ b/.github/workflows/run-sweep.yml @@ -199,8 +199,6 @@ jobs: if: ${{ always() && needs.setup.result != 'skipped' }} uses: ./.github/workflows/collect-evals.yml secrets: inherit - with: - result-prefix: "bmk" upload-changelog-metadata: needs: [setup, collect-results] From 569d0c3607b28f42189b20508f4d354fa99664c3 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 16 Dec 2025 13:54:10 -0600 Subject: [PATCH 186/214] add result-prefix to benchmark tmpl uploaded artifacts --- .github/workflows/benchmark-tmpl.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 08a4b5fef..6f2dead4f 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -173,10 +173,11 @@ jobs: RUNNER_TYPE: ${{ inputs.runner }} run: | python3 utils/process_result.py + - name: Upload result uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 with: - name: ${{ env.RESULT_FILENAME }} + name: bmk_${{ env.RESULT_FILENAME }} path: agg_${{ env.RESULT_FILENAME 
}}.json
 
     - name: Upload eval results (if any)

From 30a3431f23aa05bdfd9f0e8761db1ef6a6a571cf Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Tue, 16 Dec 2025 20:35:05 -0800
Subject: [PATCH 187/214] Evals summary refactor

---
 utils/collect_eval_results.py | 369 +++++++++++++---------------------
 utils/summarize.py            | 204 +++++++++++--------
 2 files changed, 257 insertions(+), 316 deletions(-)

diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py
index 4f6f0dd30..bb089d519 100644
--- a/utils/collect_eval_results.py
+++ b/utils/collect_eval_results.py
@@ -1,235 +1,138 @@
 #!/usr/bin/env python3
-import os
 import sys
 import json
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
+from tabulate import tabulate
+
+# Import shared utilities from summarize
+sys.path.insert(0, str(Path(__file__).resolve().parent))
+from summarize import (
+    load_json, MODEL, HARDWARE, FRAMEWORK, PRECISION,
+    TP, EP, CONC, DP_ATTENTION, TASK, EM_STRICT, EM_FLEXIBLE, N_EFF
+)
 
 
 def find_eval_sets(root: Path) -> List[Path]:
     """Return directories that contain a meta_env.json (one set per job).
-
-    New structure: each downloaded artifact is placed under
-    eval_results/<artifact>/ with flat files inside, e.g.:
-      - meta_env.json
-      - results_*.json
-
-    We first check immediate child directories for meta_env.json to avoid
-    descending unnecessarily. If nothing is found (backward compatibility),
-    fall back to recursive search.
+
+    Structure: eval_results/<artifact>/meta_env.json
     """
     out: List[Path] = []
-    # Prefer immediate children (one directory per artifact)
     try:
         for d in root.iterdir():
             if d.is_dir() and (d / 'meta_env.json').exists():
                 out.append(d)
     except Exception:
         pass
-    if out:
-        return out
-    # Fallback: recursive (legacy structure)
-    for p in root.rglob('meta_env.json'):
-        out.append(p.parent)
     return out
 
 
-def load_json(path: Path) -> Optional[Dict[str, Any]]:
-    try:
-        with open(path, 'r') as f:
-            return json.load(f)
-    except Exception:
-        return None
-
-
 def detect_eval_jsons(d: Path) -> Tuple[Optional[Path], Optional[Path]]:
-    """Return (lm_eval_json, lighteval_json) if present (latest by mtime).
-
-    New structure places result JSONs flat in the artifact directory. We
-    first check only the immediate directory for JSONs, then fall back to
-    recursive search for backward compatibility.
+    """Return (lm_eval_json, lighteval_json) if present.
+
+    Checks immediate directory for result JSONs.
     """
-    def scan_jsons(paths: List[Path]) -> Tuple[List[Tuple[float, Path]], List[Tuple[float, Path]]]:
-        """Classify JSON files into lm-eval vs lighteval buckets.
-
-        Returns two lists of (mtime, path) where:
-        - The first list contains candidates that look like lm-eval outputs.
-        - The second list contains candidates that look like lighteval outputs.
-
-        Heuristics used (order matters):
-        - If a JSON has keys like 'lm_eval_version' or 'pretty_env_info',
-          we treat it as an lm-eval result file.
-        - If it has both 'config_general' and 'results', we treat it as
-          a lighteval result file.
-        - If it only has a top-level 'results' but none of the stronger
-          signals above, we fall back to classifying it as lm-eval.
-
-        We keep the file modification time to later choose the most recent
-        candidate; if obtaining mtime fails, we fall back to 0. 
- """ - lm: List[Tuple[float, Path]] = [] - le: List[Tuple[float, Path]] = [] - for p in paths: - if p.name == 'meta_env.json': - continue - data = load_json(p) - if not isinstance(data, dict): - continue - if 'lm_eval_version' in data or 'pretty_env_info' in data: - # lm-eval harness output - try: - lm.append((p.stat().st_mtime, p)) - except Exception: - lm.append((0, p)) - elif 'config_general' in data and 'results' in data: - # lighteval output structure - try: - le.append((p.stat().st_mtime, p)) - except Exception: - le.append((0, p)) - elif 'results' in data: - # Fallback: treat as lm-eval JSON - try: - lm.append((p.stat().st_mtime, p)) - except Exception: - lm.append((0, p)) - return lm, le - - # 1) Prefer immediate JSONs (flat structure) - immediate_jsons = list(d.glob('results*.json')) + [p for p in d.glob('*.json') if p.name != 'meta_env.json'] - lm, le = scan_jsons(immediate_jsons) - - # 2) If nothing found, fallback to deep scan (legacy) - if not lm and not le: - deep_jsons = list(d.rglob('*.json')) - lm, le = scan_jsons(deep_jsons) - - lm_path = sorted(lm, key=lambda x: x[0])[-1][1] if lm else None - le_path = sorted(le, key=lambda x: x[0])[-1][1] if le else None + immediate_jsons = list(d.glob('results*.json')) + [ + p for p in d.glob('*.json') if p.name != 'meta_env.json' + ] + + lm_path = None + le_path = None + + for p in immediate_jsons: + data = load_json(p) + if not isinstance(data, dict): + continue + + if 'lm_eval_version' in data: + # lm-eval harness - pick latest if multiple + if lm_path is None or p.stat().st_mtime > lm_path.stat().st_mtime: + lm_path = p + elif 'config_general' in data and 'results' in data: + # lighteval - pick latest if multiple + if le_path is None or p.stat().st_mtime > le_path.stat().st_mtime: + le_path = p + return lm_path, le_path -def extract_lm_metrics(json_path: Path, task: Optional[str] = None) -> Dict[str, Any]: +def extract_lm_metrics(json_path: Path) -> Dict[str, Any]: + """Extract metrics from lm-eval harness result JSON. 
+ + Uses explicit structure from the JSON file: + - Task name from results keys + - Metric name from configs.metric_list + - Filter names from configs.filter_list + - Values from results[task][metric,filter] + """ data = load_json(json_path) or {} - results = data.get('results') or {} - # Determine task key robustly: - # 1) explicit argument - # 2) only key in `results` - # 3) only key in `configs` - # 4) 'unknown' - t = task - if not t: - if isinstance(results, dict) and len(results) == 1: - t = next(iter(results.keys())) - else: - cfgs = data.get('configs') or {} - if isinstance(cfgs, dict) and len(cfgs) == 1: - t = next(iter(cfgs.keys())) - else: - # fallback to arbitrary but stable choice - t = next(iter(results.keys()), 'unknown') if isinstance(results, dict) else 'unknown' - - res = results.get(t, {}) if isinstance(results, dict) else {} - - # Determine base metric name (e.g., 'exact_match') - base_metric: Optional[str] = None - hib = (data.get('higher_is_better') or {}).get(t) if isinstance(data.get('higher_is_better'), dict) else None - if isinstance(hib, dict) and hib: - base_metric = next(iter(hib.keys())) - if not base_metric: - cfg = (data.get('configs') or {}).get(t, {}) if isinstance(data.get('configs'), dict) else {} - ml = cfg.get('metric_list') if isinstance(cfg, dict) else None - if isinstance(ml, list) and ml: - m0 = ml[0] or {} - if isinstance(m0, dict): - base_metric = m0.get('metric') - if not base_metric: - # Fallback: infer from result keys - if isinstance(res, dict): - for k in res.keys(): - if isinstance(k, str) and ',' in k: - base_metric = k.split(',', 1)[0] - break - if not base_metric and 'exact_match' in res: - base_metric = 'exact_match' - if not base_metric: - base_metric = 'exact_match' - - # Determine filter names and map to strict/flexible logically without guessing - strict_name: Optional[str] = None - flex_name: Optional[str] = None - cfg = (data.get('configs') or {}).get(t, {}) if isinstance(data.get('configs'), dict) else {} - fl = cfg.get('filter_list') if isinstance(cfg, dict) else None - filter_names: List[str] = [] - if isinstance(fl, list): - for it in fl: - if isinstance(it, dict): - nm = it.get('name') - if isinstance(nm, str): - filter_names.append(nm) - # Prefer semantic names when present; otherwise preserve file order - for nm in filter_names: - if strict_name is None and 'strict' in nm.lower(): - strict_name = nm - if flex_name is None and ('flex' in nm.lower() or 'extract' in nm.lower()): - flex_name = nm - # Fallback to first/second if semantic match not found - if not strict_name and filter_names: - strict_name = filter_names[0] - if not flex_name and len(filter_names) >= 2: - flex_name = filter_names[1] - - # Extract metrics present in results using derived keys - def get_pair(fname: Optional[str]) -> Tuple[Optional[float], Optional[float]]: - if not fname: - # try unfiltered key - v = res.get(base_metric) - se = res.get(f"{base_metric}_stderr") - try: - return float(v) if v is not None else None, float(se) if se is not None else None - except Exception: - return v, se - v = res.get(f"{base_metric},{fname}") - se = res.get(f"{base_metric}_stderr,{fname}") - try: - return float(v) if v is not None else None, float(se) if se is not None else None - except Exception: - return v, se - - strict, strict_se = get_pair(strict_name) - flex, flex_se = get_pair(flex_name) - - n_eff = None - ns = data.get('n-samples') or data.get('n_samples') or {} - if isinstance(ns, dict): - td = ns.get(t) or {} - if isinstance(td, dict): - n_eff = 
td.get('effective') or td.get('n_eff') - + results = data.get('results', {}) + configs = data.get('configs', {}) + + if not results: + return {} + + # 1. Task: first key from results + task = next(iter(results.keys())) + + # 2. Base metric: from config's metric_list + metric_list = configs.get(task, {}).get('metric_list', []) + base_metric = metric_list[0]['metric'] if metric_list else 'exact_match' + + # 3. Filters: from config's filter_list + filter_list = configs.get(task, {}).get('filter_list', []) + + strict_val, strict_se = None, None + flex_val, flex_se = None, None + + # Helper to get value/stderr pair for filtered metrics + def get_val_se(filter_name: str) -> Tuple[Optional[float], Optional[float]]: + val_key = f"{base_metric},{filter_name}" + se_key = f"{base_metric}_stderr,{filter_name}" + return results[task].get(val_key), results[task].get(se_key) + + # Extract metrics based on filter_list + if not filter_list: + # No filters - use base metric for strict + strict_val = results[task].get(base_metric) + strict_se = results[task].get(f"{base_metric}_stderr") + else: + # Extract metrics for each filter + for f in filter_list: + fname = f['name'] + if 'strict' in fname: + strict_val, strict_se = get_val_se(fname) + elif 'flex' in fname or 'extract' in fname: + flex_val, flex_se = get_val_se(fname) + + # N-samples (effective count) + n_eff = data.get('n-samples', {}).get(task, {}).get('effective') + + # Model name model = ( - data.get('model_name') - or (data.get('configs', {}).get(t, {}) or {}).get('metadata', {}).get('model') - or (data.get('config') or {}).get('model') - or '' + data.get('model_name') + or configs.get(task, {}).get('metadata', {}).get('model') ) return { - 'task': t, - 'strict': strict, - 'flex': flex, + 'task': task, + 'strict': strict_val, 'strict_se': strict_se, + 'flex': flex_val, 'flex_se': flex_se, 'n_eff': n_eff, - 'hardware': 'Unknown GPU', 'model': model, 'source': str(json_path) } def extract_lighteval_metrics(json_path: Path, task_base: Optional[str] = None) -> Dict[str, Any]: + """Extract metrics from lighteval result JSON.""" data = load_json(json_path) or {} results = data.get('results', {}) or {} - # Choose a task key starting with task_base if provided, else 'all', else first key + + # Find task key key = None if task_base: for k in results.keys(): @@ -237,12 +140,12 @@ def extract_lighteval_metrics(json_path: Path, task_base: Optional[str] = None) key = k break if key is None: - key = 'all' if 'all' in results else (next(iter(results.keys())) if results else 'unknown') - r = results.get(key, {}) if isinstance(results, dict) else {} + key = next(iter(results.keys())) if results else 'unknown' + + r = results.get(key, {}) em = r.get('extractive_match') em_se = r.get('extractive_match_stderr') - model = '' cg = data.get('config_general', {}) or {} model = cg.get('model_name') or cg.get('model_config', {}).get('model_name', '') @@ -253,13 +156,13 @@ def extract_lighteval_metrics(json_path: Path, task_base: Optional[str] = None) 'strict_se': em_se, 'flex_se': None, 'n_eff': None, - 'hardware': 'Unknown GPU', 'model': model, 'source': str(json_path) } def pct(x: Any) -> str: + """Format value as percentage.""" try: return f"{float(x)*100:.2f}%" except Exception: @@ -267,6 +170,7 @@ def pct(x: Any) -> str: def se(x: Any) -> str: + """Format stderr as percentage with ± prefix.""" try: return f" ±{float(x)*100:.2f}%" except Exception: @@ -279,13 +183,14 @@ def main(): sys.exit(1) root = Path(sys.argv[1]) - exp_name = sys.argv[2] or 'all' + exp_name = 
sys.argv[2] rows: List[Dict[str, Any]] = [] for d in find_eval_sets(root): meta = load_json(d / 'meta_env.json') or {} lm_path, le_path = detect_eval_jsons(d) - # Prefer lm-eval when available, else lighteval + + # Extract metrics (prefer lm-eval) if lm_path: m = extract_lm_metrics(lm_path) elif le_path: @@ -293,16 +198,20 @@ def main(): else: continue + if not m: + continue + + # Build row from meta + metrics row = { - 'model': m.get('model') or meta.get('model') or 'unknown', - 'hw': (meta.get('hw') or 'unknown').upper(), - 'framework': (meta.get('framework') or 'unknown').lower(), - 'precision': (meta.get('precision') or 'unknown').lower(), - 'tp': int(meta.get('tp') or 1), - 'ep': int(meta.get('ep') or 1), - 'conc': int(meta.get('conc') or 0), - 'dp_attention': str(meta.get('dp_attention') or 'false'), - 'task': m.get('task') or 'unknown', + 'model': m.get('model') or meta.get('model', 'unknown'), + 'hw': meta.get('hw', 'unknown').upper(), + 'framework': meta.get('framework', 'unknown').lower(), + 'precision': meta.get('precision', 'unknown').lower(), + 'tp': int(meta.get('tp', 1)), + 'ep': int(meta.get('ep', 1)), + 'conc': int(meta.get('conc', 0)), + 'dp_attention': str(meta.get('dp_attention', False)).lower(), + 'task': m.get('task', 'unknown'), 'em_strict': m.get('strict'), 'em_strict_se': m.get('strict_se'), 'em_flexible': m.get('flex'), @@ -314,31 +223,37 @@ def main(): # Sort for stable output rows.sort(key=lambda r: ( - r.get('hw',''), r.get('framework',''), - r.get('precision',''), r.get('tp',0), r.get('conc',0) + r['hw'], r['framework'], r['precision'], r['tp'], r['conc'] )) if not rows: print('> No eval results found to summarize.') else: - # Print Markdown summary table - print('| Model | Hardware | Framework | Precision | TP | EP | Conc | DPA | Task | EM Strict | EM Flexible | N (eff) |') - print('| :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |') - for r in rows: - print( - f"| {r['model']} " - f"| {r['hw']} " - f"| {r['framework'].upper()} " - f"| {r['precision'].upper()} " - f"| {r['tp']} " - f"| {r['ep']} " - f"| {r['conc']} " - f"| {r['dp_attention']} " - f"| {r['task']} " - f"| {pct(r['em_strict'])}{se(r['em_strict_se'])} " - f"| {pct(r['em_flexible'])}{se(r['em_flexible_se'])} " - f"| {r['n_eff'] or ''} |" - ) + # Print table using tabulate + headers = [ + MODEL, HARDWARE, FRAMEWORK, PRECISION, TP, EP, CONC, DP_ATTENTION, + TASK, EM_STRICT, EM_FLEXIBLE, N_EFF + ] + + table_rows = [ + [ + r['model'], + r['hw'], + r['framework'].upper(), + r['precision'].upper(), + r['tp'], + r['ep'], + r['conc'], + r['dp_attention'], + r['task'], + f"{pct(r['em_strict'])}{se(r['em_strict_se'])}", + f"{pct(r['em_flexible'])}{se(r['em_flexible_se'])}", + r['n_eff'] or '' + ] + for r in rows + ] + + print(tabulate(table_rows, headers=headers, tablefmt="github")) # Write JSON aggregate out_path = Path(f'agg_eval_{exp_name}.json') @@ -347,4 +262,4 @@ def main(): if __name__ == '__main__': - main() + main() \ No newline at end of file diff --git a/utils/summarize.py b/utils/summarize.py index a46c2e02a..c40754ab7 100644 --- a/utils/summarize.py +++ b/utils/summarize.py @@ -1,6 +1,7 @@ import sys import json from pathlib import Path +from typing import Any, Dict, Optional from tabulate import tabulate # Header constants @@ -33,95 +34,120 @@ DECODE_WORKERS = "Decode Workers" DECODE_GPUS = "Decode GPUs" -results = [] -results_dir = Path(sys.argv[1]) -for result_path in results_dir.rglob('*.json'): - with open(result_path) as f: - result = json.load(f) - 
results.append(result)
-
-single_node_results = [r for r in results if not r['is_multinode']]
-multinode_results = [r for r in results if r['is_multinode']]
-
-# Single-node and multi-node results have different fields and therefore need to be printed separately
-if single_node_results:
-    single_node_results.sort(key=lambda r: (
-        r['infmax_model_prefix'], r['hw'], r['framework'], r['precision'], r['isl'], r['osl'], r['tp'], r['ep'], r['conc']))
-
-    single_node_headers = [
-        MODEL, SERVED_MODEL, HARDWARE, FRAMEWORK, PRECISION, ISL, OSL, TP, EP, DP_ATTENTION,
-        CONC, TTFT, TPOT, INTERACTIVITY, E2EL, TPUT_PER_GPU, OUTPUT_TPUT_PER_GPU, INPUT_TPUT_PER_GPU
-    ]
-
-    single_node_rows = [
-        [
-            r['infmax_model_prefix'],
-            r['model'],
-            r['hw'].upper(),
-            r['framework'].upper(),
-            r['precision'].upper(),
-            r['isl'],
-            r['osl'],
-            r['tp'],
-            r['ep'],
-            r['dp_attention'],
-            r['conc'],
-            f"{r['median_ttft'] * 1000:.4f}",
-            f"{r['median_tpot'] * 1000:.4f}",
-            f"{r['median_intvty']:.4f}",
-            f"{r['median_e2el']:.4f}",
-            f"{r['tput_per_gpu']:.4f}",
-            f"{r['output_tput_per_gpu']:.4f}",
-            f"{r['input_tput_per_gpu']:.4f}",
+# Eval constants
+TASK = "Task"
+EM_STRICT = "EM Strict"
+EM_FLEXIBLE = "EM Flexible"
+N_EFF = "N (eff)"
+
+
+def load_json(path: Path) -> Optional[Dict[str, Any]]:
+    """Load JSON file and return dict, or None on error."""
+    try:
+        with open(path, 'r') as f:
+            return json.load(f)
+    except Exception:
+        return None
+
+
+def main():
+    if len(sys.argv) < 2:
+        print("Usage: python summarize.py <results_dir>")
+        sys.exit(1)
+
+    results = []
+    results_dir = Path(sys.argv[1])
+    for result_path in results_dir.rglob('*.json'):
+        result = load_json(result_path)
+        if result and 'is_multinode' in result:
+            results.append(result)
+
+    single_node_results = [r for r in results if not r['is_multinode']]
+    multinode_results = [r for r in results if r['is_multinode']]
+
+    # Single-node and multi-node results have different fields and therefore need to be printed separately
+    if single_node_results:
+        single_node_results.sort(key=lambda r: (
+            r['infmax_model_prefix'], r['hw'], r['framework'], r['precision'], r['isl'], r['osl'], r['tp'], r['ep'], r['conc']))
+
+        single_node_headers = [
+            MODEL, SERVED_MODEL, HARDWARE, FRAMEWORK, PRECISION, ISL, OSL, TP, EP, DP_ATTENTION,
+            CONC, TTFT, TPOT, INTERACTIVITY, E2EL, TPUT_PER_GPU, OUTPUT_TPUT_PER_GPU, INPUT_TPUT_PER_GPU
         ]
-        for r in single_node_results
-    ]
-
-    print("## Single-Node Results\n")
-    print(tabulate(single_node_rows, headers=single_node_headers, tablefmt="github"))
-    print("\n")
-
-if multinode_results:
-    multinode_results.sort(key=lambda r: (r['infmax_model_prefix'], r['hw'], r['framework'], r['precision'], r['isl'],
-                           r['osl'], r['prefill_tp'], r['prefill_ep'], r['decode_tp'], r['decode_ep'], r['conc']))
-
-    multinode_headers = [
-        MODEL, SERVED_MODEL, HARDWARE, FRAMEWORK, PRECISION, ISL, OSL,
-        PREFILL_TP, PREFILL_EP, PREFILL_DP_ATTN, PREFILL_WORKERS, PREFILL_GPUS,
-        DECODE_TP, DECODE_EP, DECODE_DP_ATTN, DECODE_WORKERS, DECODE_GPUS,
-        CONC, TTFT, TPOT, INTERACTIVITY, E2EL, TPUT_PER_GPU, OUTPUT_TPUT_PER_GPU, INPUT_TPUT_PER_GPU
-    ]
-
-    multinode_rows = [
-        [
-            r['infmax_model_prefix'],
-            r['model'],
-            r['hw'].upper(),
-            r['framework'].upper(),
-            r['precision'].upper(),
-            r['isl'],
-            r['osl'],
-            r['prefill_tp'],
-            r['prefill_ep'],
-            r['prefill_dp_attention'],
-            r['prefill_num_workers'],
-            r['num_prefill_gpu'],
-            r['decode_tp'],
-            r['decode_ep'],
-            r['decode_dp_attention'],
-            r['decode_num_workers'],
-            r['num_decode_gpu'],
-            r['conc'],
-            f"{r['median_ttft'] * 
1000:.4f}", - f"{r['median_tpot'] * 1000:.4f}", - f"{r['median_intvty']:.4f}", - f"{r['median_e2el']:.4f}", - f"{r['tput_per_gpu']:.4f}", - f"{r['output_tput_per_gpu']:.4f}", - f"{r['input_tput_per_gpu']:.4f}", + + single_node_rows = [ + [ + r['infmax_model_prefix'], + r['model'], + r['hw'].upper(), + r['framework'].upper(), + r['precision'].upper(), + r['isl'], + r['osl'], + r['tp'], + r['ep'], + r['dp_attention'], + r['conc'], + f"{r['median_ttft'] * 1000:.4f}", + f"{r['median_tpot'] * 1000:.4f}", + f"{r['median_intvty']:.4f}", + f"{r['median_e2el']:.4f}", + f"{r['tput_per_gpu']:.4f}", + f"{r['output_tput_per_gpu']:.4f}", + f"{r['input_tput_per_gpu']:.4f}", + ] + for r in single_node_results ] - for r in multinode_results - ] - print("## Multi-Node Results\n") - print(tabulate(multinode_rows, headers=multinode_headers, tablefmt="github")) + print("## Single-Node Results\n") + print(tabulate(single_node_rows, headers=single_node_headers, tablefmt="github")) + print("\n") + + if multinode_results: + multinode_results.sort(key=lambda r: (r['infmax_model_prefix'], r['hw'], r['framework'], r['precision'], r['isl'], + r['osl'], r['prefill_tp'], r['prefill_ep'], r['decode_tp'], r['decode_ep'], r['conc'])) + + multinode_headers = [ + MODEL, SERVED_MODEL, HARDWARE, FRAMEWORK, PRECISION, ISL, OSL, + PREFILL_TP, PREFILL_EP, PREFILL_DP_ATTN, PREFILL_WORKERS, PREFILL_GPUS, + DECODE_TP, DECODE_EP, DECODE_DP_ATTN, DECODE_WORKERS, DECODE_GPUS, + CONC, TTFT, TPOT, INTERACTIVITY, E2EL, TPUT_PER_GPU, OUTPUT_TPUT_PER_GPU, INPUT_TPUT_PER_GPU + ] + + multinode_rows = [ + [ + r['infmax_model_prefix'], + r['model'], + r['hw'].upper(), + r['framework'].upper(), + r['precision'].upper(), + r['isl'], + r['osl'], + r['prefill_tp'], + r['prefill_ep'], + r['prefill_dp_attention'], + r['prefill_num_workers'], + r['num_prefill_gpu'], + r['decode_tp'], + r['decode_ep'], + r['decode_dp_attention'], + r['decode_num_workers'], + r['num_decode_gpu'], + r['conc'], + f"{r['median_ttft'] * 1000:.4f}", + f"{r['median_tpot'] * 1000:.4f}", + f"{r['median_intvty']:.4f}", + f"{r['median_e2el']:.4f}", + f"{r['tput_per_gpu']:.4f}", + f"{r['output_tput_per_gpu']:.4f}", + f"{r['input_tput_per_gpu']:.4f}", + ] + for r in multinode_results + ] + + print("## Multi-Node Results\n") + print(tabulate(multinode_rows, headers=multinode_headers, tablefmt="github")) + + +if __name__ == "__main__": + main() \ No newline at end of file From 22c8a2bd9202d5ee69f4679b1eda40ac7d9937d3 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 16 Dec 2025 21:43:56 -0800 Subject: [PATCH 188/214] Evals summary refactor 2 --- .github/workflows/collect-evals.yml | 1 + .github/workflows/collect-results.yml | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/collect-evals.yml b/.github/workflows/collect-evals.yml index 64bf603e8..4d6288be6 100644 --- a/.github/workflows/collect-evals.yml +++ b/.github/workflows/collect-evals.yml @@ -29,6 +29,7 @@ jobs: - name: Summarize evals run: | + pip install tabulate echo "## Eval Summary - ${{ inputs.result-prefix || 'all' }}" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY python3 utils/collect_eval_results.py eval_results/ ${{ inputs.result-prefix || 'all' }} >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/collect-results.yml b/.github/workflows/collect-results.yml index ccc2ce4e4..5bfbde52e 100644 --- a/.github/workflows/collect-results.yml +++ b/.github/workflows/collect-results.yml @@ -34,7 +34,9 @@ jobs: python3 utils/summarize.py results/ >> $GITHUB_STEP_SUMMARY - 
name: Aggregate results - run: python3 utils/collect_results.py results/ ${{ inputs.result-prefix || 'all' }} + run: | + pip install tabulate + python3 utils/collect_results.py results/ ${{ inputs.result-prefix || 'all' }} - name: Upload aggregated results uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 From 8d12b35afc0b733c27fe3a0988ecba65bea4ed71 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 16 Dec 2025 22:16:40 -0800 Subject: [PATCH 189/214] Evals summary aesthetics --- .github/workflows/collect-evals.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/collect-evals.yml b/.github/workflows/collect-evals.yml index 4d6288be6..606117e79 100644 --- a/.github/workflows/collect-evals.yml +++ b/.github/workflows/collect-evals.yml @@ -30,7 +30,7 @@ jobs: - name: Summarize evals run: | pip install tabulate - echo "## Eval Summary - ${{ inputs.result-prefix || 'all' }}" >> $GITHUB_STEP_SUMMARY + echo "## Eval Summary" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY python3 utils/collect_eval_results.py eval_results/ ${{ inputs.result-prefix || 'all' }} >> $GITHUB_STEP_SUMMARY From d7a515a55c999603352013b28026272dec34979d Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 17 Dec 2025 18:18:35 -0800 Subject: [PATCH 190/214] TRT package fix, trt testing --- benchmarks/dsr1_fp8_h200_trt_slurm.sh | 3 ++- benchmarks/gptoss_fp4_h200_trt_slurm.sh | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index e71cc8b0d..3a0498d6a 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -54,7 +54,8 @@ fi set -x -MAX_NUM_TOKENS=$(( ($CONC+$ISL+64+63)/64*64 )) +MAX_NUM_TOKENS=$(( (CONC + ISL + 64 + 63) / 64 * 64 )) +MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 4096 ? MAX_MODEL_LEN : 4096 )) # Launch TRT-LLM server PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \ diff --git a/benchmarks/gptoss_fp4_h200_trt_slurm.sh b/benchmarks/gptoss_fp4_h200_trt_slurm.sh index 26043d322..964fd0352 100644 --- a/benchmarks/gptoss_fp4_h200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_trt_slurm.sh @@ -40,7 +40,7 @@ print_iter_log: true stream_interval: 20 EOF -mpirun -n 1 --oversubscribe --allow-run-as-root \ +PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \ trtllm-serve $MODEL \ --max_batch_size $CONC \ --max_num_tokens 20000 \ From 25f71bd1997ebbbb5de418017f73942161690959 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 17 Dec 2025 18:42:58 -0800 Subject: [PATCH 191/214] trt testing 2 --- benchmarks/dsr1_fp4_b200_trt_slurm.sh | 1 + benchmarks/dsr1_fp8_b200_trt_slurm.sh | 1 + 2 files changed, 2 insertions(+) diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh index f4165b72a..e72d4dcd1 100644 --- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh @@ -85,6 +85,7 @@ fi set -x MAX_NUM_TOKENS=$(( ($CONC+$ISL+64+63)/64*64 )) +MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 4096 ? MAX_MODEL_LEN : 4096 )) # Launch TRT-LLM server mpirun -n 1 --oversubscribe --allow-run-as-root \ diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh index c77f5277f..f9ab48a10 100644 --- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh @@ -55,6 +55,7 @@ fi set -x MAX_NUM_TOKENS=$(( ($CONC+$ISL+64+63)/64*64 )) +MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 4096 ? 
MAX_MODEL_LEN : 4096 )) # Launch TRT-LLM server mpirun -n 1 --oversubscribe --allow-run-as-root \ From ab6bf8f18de9063f58b113931ee4bec227e443f2 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 18 Dec 2025 21:40:32 -0800 Subject: [PATCH 192/214] max_num_tokens --- benchmarks/dsr1_fp4_b200_trt_slurm.sh | 1 + benchmarks/dsr1_fp8_b200_trt_slurm.sh | 1 + benchmarks/dsr1_fp8_h200_trt_slurm.sh | 1 + 3 files changed, 3 insertions(+) diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh index e72d4dcd1..a0902ad46 100644 --- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh @@ -86,6 +86,7 @@ set -x MAX_NUM_TOKENS=$(( ($CONC+$ISL+64+63)/64*64 )) MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 4096 ? MAX_MODEL_LEN : 4096 )) +MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 4096 ? MAX_NUM_TOKENS : 4096 )) # Launch TRT-LLM server mpirun -n 1 --oversubscribe --allow-run-as-root \ diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh index f9ab48a10..83d3c74a6 100644 --- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh @@ -56,6 +56,7 @@ set -x MAX_NUM_TOKENS=$(( ($CONC+$ISL+64+63)/64*64 )) MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 4096 ? MAX_MODEL_LEN : 4096 )) +MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 4096 ? MAX_NUM_TOKENS : 4096 )) # Launch TRT-LLM server mpirun -n 1 --oversubscribe --allow-run-as-root \ diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index 3a0498d6a..a8ee33776 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -56,6 +56,7 @@ set -x MAX_NUM_TOKENS=$(( (CONC + ISL + 64 + 63) / 64 * 64 )) MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 4096 ? MAX_MODEL_LEN : 4096 )) +MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 4096 ? 
MAX_NUM_TOKENS : 4096 )) # Launch TRT-LLM server PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \ From 9a873c4b67493569f4a56791affc392aafeaa494 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 7 Jan 2026 20:11:54 -0800 Subject: [PATCH 193/214] unbounded gen len --- benchmarks/benchmark_lib.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 1e25a8421..8ad96299b 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -310,7 +310,7 @@ run_lm_eval() { --tasks "utils/evals/${task}.yaml" \ --num_fewshot "${num_fewshot}" \ --output_path "${results_dir}" \ - --model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=2,num_concurrent=${concurrent_requests},tokenized_requests=False" \ + --model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=2,num_concurrent=${concurrent_requests},tokenized_requests=False,max_length=32768" \ --gen_kwargs "max_tokens=${gen_max_tokens},temperature=${temperature},top_p=${top_p}" set +x } From 999b9f67d8c020ef5f53a8d006f82fcd7f5d59ce Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 7 Jan 2026 20:45:51 -0800 Subject: [PATCH 194/214] Fix tmpl args, add isl/osl to table --- .github/workflows/benchmark-tmpl.yml | 1 + utils/collect_eval_results.py | 12 ++++++++---- utils/matrix_logic/generate_sweep_configs.py | 14 ++++++++++---- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 976e1d08d..73feb7ee2 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -53,6 +53,7 @@ on: run-eval: type: boolean required: true + default: false random-range-ratio: required: false type: string diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py index bb089d519..3116406e8 100644 --- a/utils/collect_eval_results.py +++ b/utils/collect_eval_results.py @@ -8,7 +8,7 @@ # Import shared utilities from summarize sys.path.insert(0, str(Path(__file__).resolve().parent)) from summarize import ( - load_json, MODEL, HARDWARE, FRAMEWORK, PRECISION, + load_json, MODEL, HARDWARE, FRAMEWORK, PRECISION, ISL, OSL, TP, EP, CONC, DP_ATTENTION, TASK, EM_STRICT, EM_FLEXIBLE, N_EFF ) @@ -207,6 +207,8 @@ def main(): 'hw': meta.get('hw', 'unknown').upper(), 'framework': meta.get('framework', 'unknown').lower(), 'precision': meta.get('precision', 'unknown').lower(), + 'isl': int(meta.get('isl', 0)), + 'osl': int(meta.get('osl', 0)), 'tp': int(meta.get('tp', 1)), 'ep': int(meta.get('ep', 1)), 'conc': int(meta.get('conc', 0)), @@ -223,7 +225,7 @@ def main(): # Sort for stable output rows.sort(key=lambda r: ( - r['hw'], r['framework'], r['precision'], r['tp'], r['conc'] + r['hw'], r['framework'], r['precision'], r['isl'], r['osl'], r['tp'], r['ep'], r['conc'] )) if not rows: @@ -231,7 +233,7 @@ def main(): else: # Print table using tabulate headers = [ - MODEL, HARDWARE, FRAMEWORK, PRECISION, TP, EP, CONC, DP_ATTENTION, + MODEL, HARDWARE, FRAMEWORK, PRECISION, ISL, OSL, TP, EP, CONC, DP_ATTENTION, TASK, EM_STRICT, EM_FLEXIBLE, N_EFF ] @@ -241,6 +243,8 @@ def main(): r['hw'], r['framework'].upper(), r['precision'].upper(), + r['isl'], + r['osl'], r['tp'], r['ep'], r['conc'], @@ -262,4 +266,4 @@ def main(): if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git 
a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py index db6826079..da81685a7 100644 --- a/utils/matrix_logic/generate_sweep_configs.py +++ b/utils/matrix_logic/generate_sweep_configs.py @@ -41,15 +41,21 @@ def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: from collections import defaultdict # Group entries by (model, runner, framework, precision, isl, osl) - # This ensures we compare within the same configuration, not across different frameworks + # Only include entries that have a top-level TP (i.e., single-node schema). + # This avoids relying on structural hints like prefill/decode which may be + # reused by future single-node disaggregated modes. groups = defaultdict(list) for i, entry in enumerate(matrix_values): + # Skip entries without a top-level TP field + if Fields.TP.value not in entry: + continue + key = ( - entry[Fields.MODEL.value], - entry[Fields.RUNNER.value], + entry[Fields.MODEL.value], + entry[Fields.RUNNER.value], entry[Fields.FRAMEWORK.value], entry[Fields.PRECISION.value], - entry[Fields.ISL.value], + entry[Fields.ISL.value], entry[Fields.OSL.value] ) groups[key].append((i, entry)) From 9a132501360035dc54c8cd1dea148ba67979181d Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 7 Jan 2026 22:17:54 -0800 Subject: [PATCH 195/214] add isl/osl --- .github/workflows/benchmark-tmpl.yml | 1 - benchmarks/benchmark_lib.sh | 4 +++- benchmarks/dsr1_fp4_b200_slurm.sh | 6 ++++++ benchmarks/dsr1_fp8_b200_slurm.sh | 9 ++++++++- benchmarks/gptoss_fp4_b200_slurm.sh | 7 +++++++ benchmarks/gptoss_fp4_b200_trt_docker.sh | 9 ++++++++- 6 files changed, 32 insertions(+), 4 deletions(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 73feb7ee2..976e1d08d 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -53,7 +53,6 @@ on: run-eval: type: boolean required: true - default: false random-range-ratio: required: false type: string diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 8ad96299b..1c1fc5398 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -357,7 +357,9 @@ append_lm_eval_summary() { "ep": ${EP_SIZE:-1}, "dp_attention": ${dp_json}, "model": "${model_name:-}", - "hw": "${RUNNER_TYPE:-unknown}" + "hw": "${RUNNER_TYPE:-unknown}", + "isl": "${ISL:-0}", + "osl": "${OSL:-0}" } META diff --git a/benchmarks/dsr1_fp4_b200_slurm.sh b/benchmarks/dsr1_fp4_b200_slurm.sh index 570d39792..730404602 100644 --- a/benchmarks/dsr1_fp4_b200_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_slurm.sh @@ -57,3 +57,9 @@ run_benchmark_serving \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 )) + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/dsr1_fp8_b200_slurm.sh b/benchmarks/dsr1_fp8_b200_slurm.sh index 71532f816..e6d107661 100644 --- a/benchmarks/dsr1_fp8_b200_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_slurm.sh @@ -57,4 +57,11 @@ run_benchmark_serving \ --num-prompts $(( $CONC * 10 )) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ No newline at end of file + --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 )) 
+ append_lm_eval_summary +fi +set +x diff --git a/benchmarks/gptoss_fp4_b200_slurm.sh b/benchmarks/gptoss_fp4_b200_slurm.sh index 6890a2191..c790a9ca8 100644 --- a/benchmarks/gptoss_fp4_b200_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_slurm.sh @@ -68,3 +68,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 )) + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/gptoss_fp4_b200_trt_docker.sh b/benchmarks/gptoss_fp4_b200_trt_docker.sh index 1f5fbe868..61ffe6318 100644 --- a/benchmarks/gptoss_fp4_b200_trt_docker.sh +++ b/benchmarks/gptoss_fp4_b200_trt_docker.sh @@ -87,4 +87,11 @@ run_benchmark_serving \ --num-prompts "$NUM_PROMPTS" \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ No newline at end of file + --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 )) + append_lm_eval_summary +fi +set +x From 4b0f8ded247ddebba343ca5367051fcaba1e3817 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 12 Jan 2026 13:21:16 -0800 Subject: [PATCH 196/214] set max tokens --- benchmarks/benchmark_lib.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 1c1fc5398..ffc92000a 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -275,7 +275,7 @@ run_lm_eval() { local task="${EVAL_TASK:-gsm8k}" local num_fewshot="${NUM_FEWSHOT:-2}" local results_dir="${EVAL_RESULT_DIR:-$(mktemp -d /tmp/eval_out-XXXXXX)}" - local gen_max_tokens=4096 + local gen_max_tokens=16384 local temperature=0 local top_p=1 local concurrent_requests=32 @@ -310,7 +310,7 @@ run_lm_eval() { --tasks "utils/evals/${task}.yaml" \ --num_fewshot "${num_fewshot}" \ --output_path "${results_dir}" \ - --model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=2,num_concurrent=${concurrent_requests},tokenized_requests=False,max_length=32768" \ + --model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=2,num_concurrent=${concurrent_requests},tokenized_requests=False,max_length=${gen_max_tokens}" \ --gen_kwargs "max_tokens=${gen_max_tokens},temperature=${temperature},top_p=${top_p}" set +x } From a52f4c6b2912b032d3adbba9025720efb5e3f037 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 12 Jan 2026 13:30:37 -0800 Subject: [PATCH 197/214] remove nvd --- .github/configs/runners.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml index e414af03e..458d2a7dc 100644 --- a/.github/configs/runners.yaml +++ b/.github/configs/runners.yaml @@ -21,10 +21,6 @@ b200-trt: - 'b200-nb_1' b200: # Docker-only nodes -- 'b200-nvd_0' -- 'b200-nvd_1' -- 'b200-nvd_2' -- 'b200-nvd_3' - 'b200-dgxc_1' - 'b200-dgxc_2' # Slurm nodes (also have b200 label, can run docker workloads) From 568e1d3d408fb13c131ae0e09890e6679e2d8968 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 12 Jan 2026 16:25:33 -0800 Subject: [PATCH 198/214] In case of multiple evals --- .github/workflows/benchmark-tmpl.yml | 1 + benchmarks/benchmark_lib.sh | 2 +- 
utils/collect_eval_results.py | 270 +++++++++++++++------------ utils/summarize.py | 1 + 4 files changed, 158 insertions(+), 116 deletions(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 976e1d08d..73feb7ee2 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -53,6 +53,7 @@ on: run-eval: type: boolean required: true + default: false random-range-ratio: required: false type: string diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index ffc92000a..27a3aea60 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -632,7 +632,7 @@ run_lighteval_eval() { _install_lighteval_deps _patch_lighteval_litellm - # Prefer OPENAI_MODEL_NAME, then EVAL_MODEL_NAME, then MODEL + # For lighteval, MODEL_NAME MUST BE SET local model_name="${MODEL_NAME}" if [[ -z "$model_name" ]]; then echo "Error: MODEL not set for lighteval." >&2 diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py index 3116406e8..11752bef3 100644 --- a/utils/collect_eval_results.py +++ b/utils/collect_eval_results.py @@ -9,7 +9,7 @@ sys.path.insert(0, str(Path(__file__).resolve().parent)) from summarize import ( load_json, MODEL, HARDWARE, FRAMEWORK, PRECISION, ISL, OSL, - TP, EP, CONC, DP_ATTENTION, TASK, EM_STRICT, EM_FLEXIBLE, N_EFF + TP, EP, CONC, DP_ATTENTION, TASK, SCORE, EM_STRICT, EM_FLEXIBLE, N_EFF ) @@ -57,11 +57,13 @@ def detect_eval_jsons(d: Path) -> Tuple[Optional[Path], Optional[Path]]: return lm_path, le_path -def extract_lm_metrics(json_path: Path) -> Dict[str, Any]: +def extract_lm_metrics(json_path: Path) -> List[Dict[str, Any]]: """Extract metrics from lm-eval harness result JSON. - + + Returns a list of metric dicts, one per task in the results. + Uses explicit structure from the JSON file: - - Task name from results keys + - Task names from results keys - Metric name from configs.metric_list - Filter names from configs.filter_list - Values from results[task][metric,filter] @@ -69,96 +71,111 @@ def extract_lm_metrics(json_path: Path) -> Dict[str, Any]: data = load_json(json_path) or {} results = data.get('results', {}) configs = data.get('configs', {}) - + if not results: - return {} - - # 1. Task: first key from results - task = next(iter(results.keys())) - - # 2. Base metric: from config's metric_list - metric_list = configs.get(task, {}).get('metric_list', []) - base_metric = metric_list[0]['metric'] if metric_list else 'exact_match' - - # 3. 
Filters: from config's filter_list - filter_list = configs.get(task, {}).get('filter_list', []) - - strict_val, strict_se = None, None - flex_val, flex_se = None, None - - # Helper to get value/stderr pair for filtered metrics - def get_val_se(filter_name: str) -> Tuple[Optional[float], Optional[float]]: - val_key = f"{base_metric},{filter_name}" - se_key = f"{base_metric}_stderr,{filter_name}" - return results[task].get(val_key), results[task].get(se_key) - - # Extract metrics based on filter_list - if not filter_list: - # No filters - use base metric for strict - strict_val = results[task].get(base_metric) - strict_se = results[task].get(f"{base_metric}_stderr") - else: - # Extract metrics for each filter - for f in filter_list: - fname = f['name'] - if 'strict' in fname: - strict_val, strict_se = get_val_se(fname) - elif 'flex' in fname or 'extract' in fname: - flex_val, flex_se = get_val_se(fname) - - # N-samples (effective count) - n_eff = data.get('n-samples', {}).get(task, {}).get('effective') - - # Model name - model = ( - data.get('model_name') - or configs.get(task, {}).get('metadata', {}).get('model') - ) - - return { - 'task': task, - 'strict': strict_val, - 'strict_se': strict_se, - 'flex': flex_val, - 'flex_se': flex_se, - 'n_eff': n_eff, - 'model': model, - 'source': str(json_path) - } + return [] + + extracted = [] + + for task in results.keys(): + task_results = results[task] + task_config = configs.get(task, {}) + + # Base metric: from config's metric_list + metric_list = task_config.get('metric_list', []) + base_metric = metric_list[0]['metric'] if metric_list else 'exact_match' + + # Filters: from config's filter_list + filter_list = task_config.get('filter_list', []) + + strict_val, strict_se = None, None + flex_val, flex_se = None, None + accuracy_val, accuracy_se = None, None + + # Helper to get value/stderr pair for filtered metrics + def get_val_se(filter_name: str) -> Tuple[Optional[float], Optional[float]]: + val_key = f"{base_metric},{filter_name}" + se_key = f"{base_metric}_stderr,{filter_name}" + return task_results.get(val_key), task_results.get(se_key) + + # Extract metrics based on filter_list + if not filter_list: + # No filters - check for accuracy or use base metric + if 'acc' in task_results: + accuracy_val = task_results.get('acc') + accuracy_se = task_results.get('acc_stderr') + else: + strict_val = task_results.get(base_metric) + strict_se = task_results.get(f"{base_metric}_stderr") + else: + # Extract metrics for each filter + for f in filter_list: + fname = f['name'] + if 'strict' in fname: + strict_val, strict_se = get_val_se(fname) + elif 'flex' in fname or 'extract' in fname: + flex_val, flex_se = get_val_se(fname) + + # N-samples (effective count) + n_eff = data.get('n-samples', {}).get(task, {}).get('effective') + # Model name + model = ( + data.get('model_name') + or task_config.get('metadata', {}).get('model') + ) -def extract_lighteval_metrics(json_path: Path, task_base: Optional[str] = None) -> Dict[str, Any]: - """Extract metrics from lighteval result JSON.""" + extracted.append({ + 'task': task, + 'strict': strict_val, + 'strict_se': strict_se, + 'flex': flex_val, + 'flex_se': flex_se, + 'accuracy': accuracy_val, + 'accuracy_se': accuracy_se, + 'n_eff': n_eff, + 'model': model, + 'source': str(json_path) + }) + + return extracted + + +def extract_lighteval_metrics(json_path: Path) -> List[Dict[str, Any]]: + """Extract metrics from lighteval result JSON. + + Returns a list of metric dicts, one per task in the results. 
+ """ data = load_json(json_path) or {} results = data.get('results', {}) or {} - - # Find task key - key = None - if task_base: - for k in results.keys(): - if str(k).startswith(task_base): - key = k - break - if key is None: - key = next(iter(results.keys())) if results else 'unknown' - - r = results.get(key, {}) - em = r.get('extractive_match') - em_se = r.get('extractive_match_stderr') + + if not results: + return [] cg = data.get('config_general', {}) or {} model = cg.get('model_name') or cg.get('model_config', {}).get('model_name', '') - return { - 'task': key, - 'strict': em, - 'flex': None, - 'strict_se': em_se, - 'flex_se': None, - 'n_eff': None, - 'model': model, - 'source': str(json_path) - } + extracted = [] + + for task in results.keys(): + r = results.get(task, {}) + em = r.get('extractive_match') + em_se = r.get('extractive_match_stderr') + + extracted.append({ + 'task': task, + 'strict': em, + 'strict_se': em_se, + 'flex': None, + 'flex_se': None, + 'accuracy': None, + 'accuracy_se': None, + 'n_eff': None, + 'model': model, + 'source': str(json_path) + }) + + return extracted def pct(x: Any) -> str: @@ -177,6 +194,45 @@ def se(x: Any) -> str: return '' +def build_row(meta: Dict[str, Any], m: Dict[str, Any]) -> Dict[str, Any]: + """Build a result row from metadata and extracted metrics.""" + row = { + 'model': m.get('model') or meta.get('model', 'unknown'), + 'hw': meta.get('hw', 'unknown').upper(), + 'framework': meta.get('framework', 'unknown').lower(), + 'precision': meta.get('precision', 'unknown').lower(), + 'isl': int(meta.get('isl', 0)), + 'osl': int(meta.get('osl', 0)), + 'tp': int(meta.get('tp', 1)), + 'ep': int(meta.get('ep', 1)), + 'conc': int(meta.get('conc', 0)), + 'dp_attention': str(meta.get('dp_attention', False)).lower(), + 'task': m.get('task', 'unknown'), + 'em_strict': m.get('strict'), + 'em_strict_se': m.get('strict_se'), + 'em_flexible': m.get('flex'), + 'em_flexible_se': m.get('flex_se'), + 'n_eff': m.get('n_eff'), + 'source': m.get('source'), + } + + # Add universal score field (primary metric for unified comparison) + if m.get('strict') is not None: + row['score'] = m.get('strict') + row['score_name'] = 'em_strict' + row['score_se'] = m.get('strict_se') + elif m.get('accuracy') is not None: + row['score'] = m.get('accuracy') + row['score_name'] = 'accuracy' + row['score_se'] = m.get('accuracy_se') + else: + row['score'] = None + row['score_name'] = None + row['score_se'] = None + + return row + + def main(): if len(sys.argv) < 3: print('Usage: collect_eval_results.py ') @@ -189,39 +245,22 @@ def main(): for d in find_eval_sets(root): meta = load_json(d / 'meta_env.json') or {} lm_path, le_path = detect_eval_jsons(d) - - # Extract metrics (prefer lm-eval) + + # Extract metrics (prefer lm-eval) - returns list for multi-task support if lm_path: - m = extract_lm_metrics(lm_path) + metrics_list = extract_lm_metrics(lm_path) elif le_path: - m = extract_lighteval_metrics(le_path) + metrics_list = extract_lighteval_metrics(le_path) else: continue - if not m: + if not metrics_list: continue - # Build row from meta + metrics - row = { - 'model': m.get('model') or meta.get('model', 'unknown'), - 'hw': meta.get('hw', 'unknown').upper(), - 'framework': meta.get('framework', 'unknown').lower(), - 'precision': meta.get('precision', 'unknown').lower(), - 'isl': int(meta.get('isl', 0)), - 'osl': int(meta.get('osl', 0)), - 'tp': int(meta.get('tp', 1)), - 'ep': int(meta.get('ep', 1)), - 'conc': int(meta.get('conc', 0)), - 'dp_attention': str(meta.get('dp_attention', 
False)).lower(), - 'task': m.get('task', 'unknown'), - 'em_strict': m.get('strict'), - 'em_strict_se': m.get('strict_se'), - 'em_flexible': m.get('flex'), - 'em_flexible_se': m.get('flex_se'), - 'n_eff': m.get('n_eff'), - 'source': m.get('source'), - } - rows.append(row) + # Build row for each task in the results + for m in metrics_list: + row = build_row(meta, m) + rows.append(row) # Sort for stable output rows.sort(key=lambda r: ( @@ -233,10 +272,10 @@ def main(): else: # Print table using tabulate headers = [ - MODEL, HARDWARE, FRAMEWORK, PRECISION, ISL, OSL, TP, EP, CONC, DP_ATTENTION, - TASK, EM_STRICT, EM_FLEXIBLE, N_EFF + MODEL, HARDWARE, FRAMEWORK, PRECISION, ISL, OSL, TP, EP, CONC, DP_ATTENTION, + TASK, SCORE, EM_STRICT, EM_FLEXIBLE, N_EFF ] - + table_rows = [ [ r['model'], @@ -250,13 +289,14 @@ def main(): r['conc'], r['dp_attention'], r['task'], + f"{pct(r['score'])}{se(r['score_se'])}", f"{pct(r['em_strict'])}{se(r['em_strict_se'])}", f"{pct(r['em_flexible'])}{se(r['em_flexible_se'])}", r['n_eff'] or '' ] for r in rows ] - + print(tabulate(table_rows, headers=headers, tablefmt="github")) # Write JSON aggregate diff --git a/utils/summarize.py b/utils/summarize.py index c40754ab7..5e248164f 100644 --- a/utils/summarize.py +++ b/utils/summarize.py @@ -36,6 +36,7 @@ # Eval constants TASK = "Task" +SCORE = "Score" EM_STRICT = "EM Strict" EM_FLEXIBLE = "EM Flexible" N_EFF = "N (eff)" From d55c79622c43151fe752fd277bc47fe982eee650 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 12 Jan 2026 21:06:05 -0800 Subject: [PATCH 199/214] diagnostic --- .github/workflows/benchmark-tmpl.yml | 1 + benchmarks/benchmark_lib.sh | 4 ++-- benchmarks/gptoss_fp4_b200_trt_docker.sh | 2 +- benchmarks/gptoss_fp4_b200_trt_slurm.sh | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 73feb7ee2..addd11521 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -195,6 +195,7 @@ jobs: path: | meta_env.json results*.json + sample*.jsonl if-no-files-found: ignore - name: Cleanup eval outputs (post-upload) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 27a3aea60..4d8148387 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -309,7 +309,7 @@ run_lm_eval() { python3 -m lm_eval --model local-chat-completions --apply_chat_template \ --tasks "utils/evals/${task}.yaml" \ --num_fewshot "${num_fewshot}" \ - --output_path "${results_dir}" \ + --output_path "${results_dir}" --log_samples \ --model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=2,num_concurrent=${concurrent_requests},tokenized_requests=False,max_length=${gen_max_tokens}" \ --gen_kwargs "max_tokens=${gen_max_tokens},temperature=${temperature},top_p=${top_p}" set +x @@ -373,7 +373,7 @@ META if [ "$base" != "meta_env.json" ]; then mv -f "$jf" ./ || true fi - done < <(find "${out_dir}" -type f -name "*.json" -print0 2>/dev/null) + done < <(find "${out_dir}" -type f -name "*.json*" -print0 2>/dev/null) fi # Best-effort cleanup of the temp directory diff --git a/benchmarks/gptoss_fp4_b200_trt_docker.sh b/benchmarks/gptoss_fp4_b200_trt_docker.sh index 61ffe6318..64c0556cd 100644 --- a/benchmarks/gptoss_fp4_b200_trt_docker.sh +++ b/benchmarks/gptoss_fp4_b200_trt_docker.sh @@ -91,7 +91,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - 
run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 )) + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC )) append_lm_eval_summary fi set +x diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh index 37b6edf63..b82f562de 100644 --- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh @@ -106,7 +106,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 )) + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC )) append_lm_eval_summary fi set +x From fcd14e228a1218c76a961b92220bbc1c7f82fbcb Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 13 Jan 2026 13:13:40 -0800 Subject: [PATCH 200/214] test dp_attn --- benchmarks/gptoss_fp4_b200_trt_slurm.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh index 954d7ca93..ce397265c 100644 --- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh @@ -29,6 +29,7 @@ PORT=$(( 8888 + $PORT_OFFSET )) # ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= MOE_BACKEND="TRTLLM" +export DP_ATTENTION=false echo "MOE_BACKEND set to '$MOE_BACKEND'" From c9025452b80a921479c7d9e546c12cdf14894934 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 14 Jan 2026 11:48:12 -0800 Subject: [PATCH 201/214] DP_ATTENTION back --- benchmarks/gptoss_fp4_b200_trt_slurm.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh index ce397265c..954d7ca93 100644 --- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh @@ -29,7 +29,6 @@ PORT=$(( 8888 + $PORT_OFFSET )) # ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= MOE_BACKEND="TRTLLM" -export DP_ATTENTION=false echo "MOE_BACKEND set to '$MOE_BACKEND'" From 715269c22031a2cea1621a26c38a63e24dbbc6b1 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 15 Jan 2026 09:26:21 -0800 Subject: [PATCH 202/214] REMOVE LIGHTEVAL --- benchmarks/benchmark_lib.sh | 290 ---------------------------------- utils/collect_eval_results.py | 45 +----- utils/evals/EVALS.md | 4 +- utils/evals/custom_gsm8k.py | 22 --- 4 files changed, 2 insertions(+), 359 deletions(-) delete mode 100644 utils/evals/custom_gsm8k.py diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 4d8148387..ec33311b1 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -384,295 +384,6 @@ META echo "Moved eval artifacts to: $(pwd)" } -# ------------------------------ -# Lighteval + LiteLLM patching -# ------------------------------ - -_install_lighteval_deps() { - python3 -m pip install -q --no-cache-dir "lighteval==0.13.0" "litellm==1.80.7" || true -} - -# Patch lighteval's LiteLLMClient to handle reasoning content and Python name mangling -# 1. Removed "response_format": {"type": "text"}, as it interferred with vllm endpoint -# 2. Concat reasoning with output tokens as sometimes the output is empty. 
-_patch_lighteval_litellm() { - local patch_dir - patch_dir="$(mktemp -d)" - cat > "$patch_dir/sitecustomize.py" <<'PY' -import logging -import os -import time -import re -from concurrent.futures import ThreadPoolExecutor, as_completed - -import litellm -from tqdm import tqdm - -litellm.suppress_debug_info = True -litellm.drop_params = True - -# Remove sglang import that crashes -try: - # This is where lighteval's is_package_available lives - from lighteval.utils import imports as le_imports -except Exception: - le_imports = None -else: - _orig_is_package_available = le_imports.is_package_available - - def _patched_is_package_available(pkg: str) -> bool: - # Force "sglang" to look unavailable so that - # lighteval.models.sglang.sglang_model never imports `sglang` - if pkg == "sglang": - return False - return _orig_is_package_available(pkg) - - le_imports.is_package_available = _patched_is_package_available - -from lighteval.models.endpoints.litellm_model import LiteLLMClient -from lighteval.data import GenerativeTaskDataset -from lighteval.tasks.requests import Doc -from lighteval.models.model_output import ModelResponse - -logger = logging.getLogger(__name__) - -def _patched___call_api(self, prompt, return_logits, max_new_tokens, num_samples, stop_sequence): # noqa: C901, N802 - from lighteval.models.endpoints.litellm_model import LitellmModelResponse - response = LitellmModelResponse() - # Keep dataset-provided stop sequences to cut early - max_new_tokens = self._prepare_max_new_tokens(max_new_tokens) - - if return_logits and not self.provider == "openai": - logger.warning("Returning logits is not supported for this provider, ignoring.") - - kwargs = { - "model": self.model, - "messages": prompt, - "max_tokens": max_new_tokens, - "logprobs": return_logits if self.provider == "openai" else None, - "stop": stop_sequence, - "base_url": self.base_url, - "api_key": self.api_key, - "n": num_samples, - "timeout": self.timeout, - } - - # vLLM/SGLang OpenAI servers: apply chat template and start assistant turn - if ( - self.provider == "openai" - and isinstance(self.base_url, str) - and self.base_url - and ("api.openai.com" not in self.base_url) - ): - kwargs["extra_body"] = {"use_chat_template": True, "add_generation_prompt": True} - - if "o1" in self.model: - logger.warning("O1 models do not support temperature, top_p, stop sequence. Disabling.") - else: - kwargs.update(self.generation_parameters.to_litellm_dict()) - - if kwargs.get("max_completion_tokens", None) is None: - kwargs["max_completion_tokens"] = max_new_tokens - - for attempt in range(self.API_MAX_RETRY): - try: - response = litellm.completion(**kwargs) - msg = response.choices[0].message - content = getattr(msg, "content", None) - reasoning = getattr(msg, "reasoning_content", None) - - # Accept reasoning-only replies - if (not content) and reasoning: - return response - - return response - except litellm.BadRequestError as e: - if "message" in e.__dict__ and "policy" in e.__dict__["message"]: - logger.warning("Content filtered. 
Returning empty response.") - return LitellmModelResponse() - except Exception as e: - wait_time = min(64, self.API_RETRY_SLEEP * (self.API_RETRY_MULTIPLIER**attempt)) - logger.warning(f"Error: {e}, waiting {wait_time}s before retry {attempt + 1}/{self.API_MAX_RETRY}") - time.sleep(wait_time) - - logger.error(f"API call failed after {self.API_MAX_RETRY} attempts.") - return LitellmModelResponse() - - -def _patched___call_api_parallel(self, prompts, return_logits, max_new_tokens, num_samples, stop_sequence): # noqa: N802 - # Build per-item args - return_logitss = [return_logits for _ in prompts] if not isinstance(return_logits, list) else return_logits - max_new_tokenss = [max_new_tokens for _ in prompts] if not isinstance(max_new_tokens, list) else max_new_tokens - num_sampless = [num_samples for _ in prompts] if not isinstance(num_samples, list) else num_samples - stop_sequencess = [stop_sequence for _ in prompts] - - n = len(prompts) - assert n == len(return_logitss) == len(max_new_tokenss) == len(num_sampless) == len(stop_sequencess), ( - f"Length mismatch: {len(prompts)}, {len(return_logitss)}, {len(max_new_tokenss)}, " - f"{len(num_sampless)}, {len(stop_sequencess)}" - ) - - results = [None] * n - with ThreadPoolExecutor(self.concurrent_requests) as executor: - futures = [] - for idx in range(n): - fut = executor.submit( - self._LiteLLMClient__call_api, - prompts[idx], - return_logitss[idx], - max_new_tokenss[idx], - num_sampless[idx], - stop_sequencess[idx], - ) - fut._le_idx = idx # attach index for order restoration - futures.append(fut) - - for fut in tqdm(as_completed(futures), total=n, disable=self.disable_tqdm): - idx = getattr(fut, "_le_idx", None) - try: - res = fut.result() - except Exception: - res = None - if idx is not None: - results[idx] = res - - if any(r is None for r in results): - raise ValueError("Some entries are not annotated due to errors in __call_api_parallel, please inspect and retry.") - - return results - - -def _greedy_until_impl(self, docs: list[Doc]) -> list[ModelResponse]: - dataset = GenerativeTaskDataset(requests=docs, num_dataset_splits=self.DATASET_SPLITS) - results: list[ModelResponse] = [] - - for split in tqdm( - dataset.splits_iterator(), - total=dataset.num_dataset_splits, - desc="Splits", - position=0, - disable=self.disable_tqdm, - ): - contexts = [self.prompt_manager.prepare_prompt_api(doc) for doc in split] - - max_new_tokens = split[0].generation_size - return_logits = split[0].use_logits - num_samples = split[0].num_samples - stop_sequence = split[0].stop_sequences - - if num_samples > 1 and self.generation_parameters.temperature == 0: - raise ValueError("num_samples > 1 requires temperature > 0") - - responses = self._LiteLLMClient__call_api_parallel( - contexts, - return_logits, - max_new_tokens, - num_samples, - stop_sequence, - ) - - for response, context in zip(responses, contexts): - merged_texts: list[str] = [] - reasonings: list[str | None] = [] - - for choice in response.choices: - msg = choice.message - raw_content = getattr(msg, "content", None) or "" - reasoning = getattr(msg, "reasoning_content", None) - - # For answer extraction, use only the content field - # The reasoning is stored separately for logging/debugging - merged_texts.append(raw_content.strip() if raw_content else "") - reasonings.append(reasoning if reasoning else None) - - if not merged_texts or merged_texts[0] is None: - merged_texts = [""] - - results.append( - ModelResponse( - text=merged_texts, - reasonings=reasonings, - input=context, - ) - ) - - if 
len(results) != len(dataset): - raise RuntimeError(f"Internal mismatch: {len(results)} outputs vs {len(dataset)} docs.") - - return dataset.get_original_order(results) - -# Bind patches -LiteLLMClient._LiteLLMClient__call_api = _patched___call_api -LiteLLMClient._LiteLLMClient__call_api_parallel = _patched___call_api_parallel -#LiteLLMClient.greedy_until = _greedy_until_impl -PY - export PYTHONPATH="${patch_dir}:${PYTHONPATH:-}" -} - -run_lighteval_eval() { - local port="${PORT:-8888}" - local task="${EVAL_TASK:-gsm8k}" - local num_fewshot="${NUM_FEWSHOT:-5}" - local results_dir="${EVAL_RESULT_DIR:-$(mktemp -d /tmp/eval_out-XXXXXX)}" - local max_samples=0 - local concurrent_requests=32 - - while [[ $# -gt 0 ]]; do - case $1 in - --port) port="$2"; shift 2 ;; - --task) task="$2"; shift 2 ;; - --num-fewshot) num_fewshot="$2"; shift 2 ;; - --results-dir) results_dir="$2"; shift 2 ;; - --max-samples) max_samples="$2"; shift 2 ;; - --concurrent-requests) concurrent_requests="$2"; shift 2 ;; - *) echo "Unknown parameter: $1"; return 1 ;; - esac - done - - _install_lighteval_deps - _patch_lighteval_litellm - - # For lighteval, MODEL_NAME MUST BE SET - local model_name="${MODEL_NAME}" - if [[ -z "$model_name" ]]; then - echo "Error: MODEL not set for lighteval." >&2 - return 1 - fi - - # LiteLLM provider prefix logic - local lite_model="$model_name" - if [[ "$lite_model" != openai/* ]]; then - lite_model="openai/${lite_model}" - fi - - local base_url="http://0.0.0.0:${port}/v1" - export OPENAI_API_KEY="${OPENAI_API_KEY:-EMPTY}" - - local MODEL_ARGS="model_name=${lite_model},base_url=${base_url},api_key=${OPENAI_API_KEY},generation_parameters={temperature:0.0,top_p:1,max_new_tokens:2048},concurrent_requests=${concurrent_requests}" - local TASK_SPEC="${task}|${num_fewshot}" - - # Respect absolute paths (e.g., /tmp/eval_out); otherwise write under /workspace - local output_dir - if [[ "$results_dir" = /* ]]; then - output_dir="$results_dir" - else - output_dir="/workspace/${results_dir}" - fi - - # Make output dir visible to append_lm_eval_summary - export EVAL_RESULT_DIR="$output_dir" - - set -x - lighteval endpoint litellm \ - "${MODEL_ARGS}" \ - "${TASK_SPEC}" \ - --output-dir "${output_dir}" \ - --custom-tasks utils/evals/custom_gsm8k.py \ - --max-samples "${max_samples}" - set +x -} - - # ------------------------------ # Unified eval entrypoint # ------------------------------ @@ -690,7 +401,6 @@ run_eval() { case "$framework" in lm-eval|lm_eval) run_lm_eval "${forwarded[@]}" ;; - lighteval) run_lighteval_eval "${forwarded[@]}" ;; *) echo "Unknown framework '${framework}'"; return 1 ;; esac } diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py index 11752bef3..5ffaa0cc9 100644 --- a/utils/collect_eval_results.py +++ b/utils/collect_eval_results.py @@ -29,7 +29,7 @@ def find_eval_sets(root: Path) -> List[Path]: def detect_eval_jsons(d: Path) -> Tuple[Optional[Path], Optional[Path]]: - """Return (lm_eval_json, lighteval_json) if present. + """Return (lm_eval_json) if present. Checks immediate directory for result JSONs. 
""" @@ -49,10 +49,6 @@ def detect_eval_jsons(d: Path) -> Tuple[Optional[Path], Optional[Path]]: # lm-eval harness - pick latest if multiple if lm_path is None or p.stat().st_mtime > lm_path.stat().st_mtime: lm_path = p - elif 'config_general' in data and 'results' in data: - # lighteval - pick latest if multiple - if le_path is None or p.stat().st_mtime > le_path.stat().st_mtime: - le_path = p return lm_path, le_path @@ -141,43 +137,6 @@ def get_val_se(filter_name: str) -> Tuple[Optional[float], Optional[float]]: return extracted -def extract_lighteval_metrics(json_path: Path) -> List[Dict[str, Any]]: - """Extract metrics from lighteval result JSON. - - Returns a list of metric dicts, one per task in the results. - """ - data = load_json(json_path) or {} - results = data.get('results', {}) or {} - - if not results: - return [] - - cg = data.get('config_general', {}) or {} - model = cg.get('model_name') or cg.get('model_config', {}).get('model_name', '') - - extracted = [] - - for task in results.keys(): - r = results.get(task, {}) - em = r.get('extractive_match') - em_se = r.get('extractive_match_stderr') - - extracted.append({ - 'task': task, - 'strict': em, - 'strict_se': em_se, - 'flex': None, - 'flex_se': None, - 'accuracy': None, - 'accuracy_se': None, - 'n_eff': None, - 'model': model, - 'source': str(json_path) - }) - - return extracted - - def pct(x: Any) -> str: """Format value as percentage.""" try: @@ -249,8 +208,6 @@ def main(): # Extract metrics (prefer lm-eval) - returns list for multi-task support if lm_path: metrics_list = extract_lm_metrics(lm_path) - elif le_path: - metrics_list = extract_lighteval_metrics(le_path) else: continue diff --git a/utils/evals/EVALS.md b/utils/evals/EVALS.md index 511c80804..fcdcd5360 100644 --- a/utils/evals/EVALS.md +++ b/utils/evals/EVALS.md @@ -15,14 +15,12 @@ To verify how model outputs are affected by throughput optimizations. - Check kernel implementations for correctness ## How? -- `run_eval`, definined in `benchmarks/benchmark_lib.sh`, is called in `benchmarks/*`. Either EleutherAI/lm-evaluation-harness(lmeval) or lighteval with litellm is ran, using the same endpoint as the throughput benchmark. JSON results are processed and converted to a table with `utils/collect_eval_results.py`. +- `run_eval`, definined in `benchmarks/benchmark_lib.sh`, is called in `benchmarks/*`. EleutherAI/lm-evaluation-harness(lmeval), using the same endpoint as the throughput benchmark. JSON results are processed and converted to a table with `utils/collect_eval_results.py`. ## Misc Following files are task definitions from lmeval, more info on changes within the files - `utils/evals/math500.yaml` - `utils/evals/gsm8k.yaml` -Following files are task definitions from lighteval, more info on changes within the files -- `utils/evals/custom_gsm8k.py` diff --git a/utils/evals/custom_gsm8k.py b/utils/evals/custom_gsm8k.py deleted file mode 100644 index ac6c0b9be..000000000 --- a/utils/evals/custom_gsm8k.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copied from https://github.com/huggingface/lighteval/blob/99ef5b98d422cf3620eebec9db13285493d35542/src/lighteval/tasks/tasks/gsm8k.py -# Increases generation size to 768 from 256 to better accommodate longer solutions by dsr1. 
-from lighteval.metrics.metrics import Metrics -from lighteval.tasks.lighteval_task import LightevalTaskConfig -from lighteval.tasks.tasks.gsm8k import gsm8k_prompt - -gsm8k_long = LightevalTaskConfig( - name="gsm8k_long", - prompt_function=gsm8k_prompt, - hf_repo="openai/gsm8k", - hf_subset="main", - hf_avail_splits=["train", "test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select="random_sampling_from_train", - generation_size=1024, # raised this from 256 - metrics=[Metrics.expr_gold_metric], - stop_sequence=None, # avoid early stop on "Question:" - version=0, -) - -TASKS_TABLE = [gsm8k_long] From a353ea4ab54cd271aef019c0db49983b8eed457f Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 19 Jan 2026 22:37:17 -0800 Subject: [PATCH 203/214] Add evals for atom, trt_mtp --- benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh | 7 +++++++ benchmarks/dsr1_fp4_mi355x_atom_slurm.sh | 6 ++++++ benchmarks/dsr1_fp8_mi355x_atom_slurm.sh | 6 ++++++ benchmarks/gptoss_fp4_mi355x_atom_slurm.sh | 6 ++++++ 4 files changed, 25 insertions(+) diff --git a/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh index 33d819efa..104a33ca2 100644 --- a/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh @@ -102,3 +102,10 @@ run_benchmark_serving \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ \ --use-chat-template + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 )) + append_lm_eval_summary +fi +set +x \ No newline at end of file diff --git a/benchmarks/dsr1_fp4_mi355x_atom_slurm.sh b/benchmarks/dsr1_fp4_mi355x_atom_slurm.sh index 8028ae449..c50273d60 100644 --- a/benchmarks/dsr1_fp4_mi355x_atom_slurm.sh +++ b/benchmarks/dsr1_fp4_mi355x_atom_slurm.sh @@ -65,3 +65,9 @@ run_benchmark_serving \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 )) + append_lm_eval_summary +fi +set +x \ No newline at end of file diff --git a/benchmarks/dsr1_fp8_mi355x_atom_slurm.sh b/benchmarks/dsr1_fp8_mi355x_atom_slurm.sh index 8028ae449..c50273d60 100644 --- a/benchmarks/dsr1_fp8_mi355x_atom_slurm.sh +++ b/benchmarks/dsr1_fp8_mi355x_atom_slurm.sh @@ -65,3 +65,9 @@ run_benchmark_serving \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 )) + append_lm_eval_summary +fi +set +x \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_mi355x_atom_slurm.sh b/benchmarks/gptoss_fp4_mi355x_atom_slurm.sh index 953505ee9..560e29df6 100644 --- a/benchmarks/gptoss_fp4_mi355x_atom_slurm.sh +++ b/benchmarks/gptoss_fp4_mi355x_atom_slurm.sh @@ -66,3 +66,9 @@ run_benchmark_serving \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 )) + append_lm_eval_summary +fi +set +x \ No newline at end of file From d6d4055b85b82267c3b81b0f0c4dbc12f1a90e12 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 19 Jan 2026 22:59:47 -0800 Subject: 
[PATCH 204/214] remove tokenizer from benchmarkserving --- benchmarks/dsr1_fp8_h200_slurm.sh | 4 ++-- benchmarks/gptoss_fp4_b200_docker.sh | 1 - benchmarks/gptoss_fp4_h100_docker.sh | 1 - benchmarks/gptoss_fp4_h100_slurm.sh | 1 - benchmarks/gptoss_fp4_h200_slurm.sh | 1 - benchmarks/gptoss_fp4_mi300x_docker.sh | 1 - benchmarks/gptoss_fp4_mi325x_slurm.sh | 1 - benchmarks/gptoss_fp4_mi355x_docker.sh | 1 - 8 files changed, 2 insertions(+), 9 deletions(-) diff --git a/benchmarks/dsr1_fp8_h200_slurm.sh b/benchmarks/dsr1_fp8_h200_slurm.sh index 41d649a74..657504290 100644 --- a/benchmarks/dsr1_fp8_h200_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_slurm.sh @@ -24,7 +24,7 @@ export TORCH_CUDA_ARCH_LIST="9.0" set -x if [[ $ISL -eq 1024 && $OSL -eq 1024 ]]; then - PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --tokenizer-path $MODEL \ + PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL \ --host 0.0.0.0 --port $PORT --trust-remote-code \ --tensor-parallel-size=$TP --data-parallel-size=1 \ --disable-radix-cache --max-running-requests 512 --cuda-graph-max-bs 512 \ @@ -33,7 +33,7 @@ if [[ $ISL -eq 1024 && $OSL -eq 1024 ]]; then --decode-log-interval 1 \ > $SERVER_LOG 2>&1 & else - PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --tokenizer-path $MODEL \ + PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL \ --host 0.0.0.0 --port $PORT --trust-remote-code \ --tensor-parallel-size=$TP --data-parallel-size=1 \ --disable-radix-cache --max-running-requests 256 --cuda-graph-max-bs 256 \ diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh index 44b772a9d..8949fbc93 100644 --- a/benchmarks/gptoss_fp4_b200_docker.sh +++ b/benchmarks/gptoss_fp4_b200_docker.sh @@ -66,7 +66,6 @@ pip install -q datasets pandas run_benchmark_serving \ --model "$MODEL_NAME" \ - --tokenizer "$MODEL" \ --port "$PORT" \ --backend vllm \ --input-len "$ISL" \ diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 8851b0a0c..dead5fbc7 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -46,7 +46,6 @@ pip install -q datasets pandas run_benchmark_serving \ --model "$MODEL_NAME" \ - --tokenizer "$MODEL" \ --port "$PORT" \ --backend vllm \ --input-len "$ISL" \ diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh index 99e939c69..ac9d29b2e 100644 --- a/benchmarks/gptoss_fp4_h100_slurm.sh +++ b/benchmarks/gptoss_fp4_h100_slurm.sh @@ -47,7 +47,6 @@ pip install -q datasets pandas run_benchmark_serving \ --model "$MODEL_NAME" \ - --tokenizer "$MODEL" \ --port "$PORT" \ --backend vllm \ --input-len "$ISL" \ diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh index b28a00c3f..31689bd4a 100644 --- a/benchmarks/gptoss_fp4_h200_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_slurm.sh @@ -57,7 +57,6 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S run_benchmark_serving \ --model "$MODEL_NAME" \ - --tokenizer "$MODEL" \ --port "$PORT" \ --backend vllm \ --input-len "$ISL" \ diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh index c7a39e53f..b9fb586df 100644 --- a/benchmarks/gptoss_fp4_mi300x_docker.sh +++ b/benchmarks/gptoss_fp4_mi300x_docker.sh @@ -52,7 +52,6 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S run_benchmark_serving \ --model "$MODEL_NAME" \ - --tokenizer "$MODEL" \ 
--port "$PORT" \ --backend vllm \ --input-len "$ISL" \ diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh index 9eee1a9a3..ba8dd29ad 100644 --- a/benchmarks/gptoss_fp4_mi325x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh @@ -55,7 +55,6 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S run_benchmark_serving \ --model "$MODEL_NAME" \ - --tokenizer "$MODEL" \ --port "$PORT" \ --backend vllm \ --input-len "$ISL" \ diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh index eb26fd467..d04104268 100644 --- a/benchmarks/gptoss_fp4_mi355x_docker.sh +++ b/benchmarks/gptoss_fp4_mi355x_docker.sh @@ -49,7 +49,6 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S run_benchmark_serving \ --model "$MODEL_NAME" \ - --tokenizer "$MODEL" \ --port "$PORT" \ --backend vllm \ --input-len "$ISL" \ From 338d80cefb415ce3c0db0410e3c06eb083ab8cc8 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 19 Jan 2026 23:15:17 -0800 Subject: [PATCH 205/214] remove model_name --- benchmarks/gptoss_fp4_b200_docker.sh | 5 ++--- benchmarks/gptoss_fp4_h100_docker.sh | 5 ++--- benchmarks/gptoss_fp4_h100_slurm.sh | 6 +++--- benchmarks/gptoss_fp4_h200_slurm.sh | 6 +++--- benchmarks/gptoss_fp4_mi300x_docker.sh | 5 ++--- benchmarks/gptoss_fp4_mi325x_slurm.sh | 5 ++--- benchmarks/gptoss_fp4_mi355x_docker.sh | 5 ++--- 7 files changed, 16 insertions(+), 21 deletions(-) diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh index 8949fbc93..322f352c0 100644 --- a/benchmarks/gptoss_fp4_b200_docker.sh +++ b/benchmarks/gptoss_fp4_b200_docker.sh @@ -46,7 +46,6 @@ export PYTHONNOUSERSITE=1 export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -MODEL_NAME=${MODEL##*/} set -x vllm serve $MODEL --host 0.0.0.0 --port $PORT \ @@ -55,7 +54,7 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT \ --tensor-parallel-size $TP \ --max-num-seqs 512 \ --disable-log-requests \ ---served-model-name $MODEL_NAME > $SERVER_LOG 2>&1 & +--served-model-name $MODEL > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -65,7 +64,7 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas run_benchmark_serving \ - --model "$MODEL_NAME" \ + --model "$MODEL" \ --port "$PORT" \ --backend vllm \ --input-len "$ISL" \ diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index dead5fbc7..a4c848119 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -26,7 +26,6 @@ EOF export PYTHONNOUSERSITE=1 export VLLM_MXFP4_USE_MARLIN=1 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -MODEL_NAME=${MODEL##*/} set -x vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ @@ -35,7 +34,7 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --tensor-parallel-size=$TP \ --max-num-seqs=$CONC \ --disable-log-requests \ ---served-model-name $MODEL_NAME > $SERVER_LOG 2>&1 & +--served-model-name $MODEL > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -45,7 +44,7 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas run_benchmark_serving \ - --model "$MODEL_NAME" \ + --model "$MODEL" \ --port "$PORT" \ --backend vllm \ --input-len "$ISL" \ diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh index ac9d29b2e..2e44f95ed 100644 --- a/benchmarks/gptoss_fp4_h100_slurm.sh +++ b/benchmarks/gptoss_fp4_h100_slurm.sh @@ -26,7 +26,7 @@ EOF SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) export TORCH_CUDA_ARCH_LIST="9.0" PORT=${PORT:-8888} -MODEL_NAME=${MODEL##*/} + export VLLM_MXFP4_USE_MARLIN=1 set -x @@ -36,7 +36,7 @@ PYTHONNOUSERSITE=1 vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --tensor-parallel-size=$TP \ --max-num-seqs=$CONC \ --disable-log-requests \ - --served-model-name $MODEL_NAME > $SERVER_LOG 2>&1 & + --served-model-name $MODEL > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -46,7 +46,7 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas run_benchmark_serving \ - --model "$MODEL_NAME" \ + --model "$MODEL" \ --port "$PORT" \ --backend vllm \ --input-len "$ISL" \ diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh index 31689bd4a..abe4c2daf 100644 --- a/benchmarks/gptoss_fp4_h200_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_slurm.sh @@ -39,7 +39,7 @@ EOF SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) export TORCH_CUDA_ARCH_LIST="9.0" PORT=$(( 8888 + $PORT_OFFSET )) -MODEL_NAME=${MODEL##*/} + export VLLM_MXFP4_USE_MARLIN=1 PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT \ @@ -48,7 +48,7 @@ PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT \ --tensor-parallel-size $TP \ --max-num-seqs $CONC \ --disable-log-requests \ - --served-model-name $MODEL_NAME > $SERVER_LOG 2>&1 & + --served-model-name $MODEL > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -56,7 +56,7 @@ SERVER_PID=$! wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" run_benchmark_serving \ - --model "$MODEL_NAME" \ + --model "$MODEL" \ --port "$PORT" \ --backend vllm \ --input-len "$ISL" \ diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh index b9fb586df..1019f2086 100644 --- a/benchmarks/gptoss_fp4_mi300x_docker.sh +++ b/benchmarks/gptoss_fp4_mi300x_docker.sh @@ -30,7 +30,6 @@ export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -MODEL_NAME=${MODEL##*/} set -x vllm serve $MODEL --port $PORT \ @@ -42,7 +41,7 @@ vllm serve $MODEL --port $PORT \ --block-size=64 \ --no-enable-prefix-caching \ --disable-log-requests \ ---served-model-name $MODEL_NAME \ +--served-model-name $MODEL \ --async-scheduling > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -51,7 +50,7 @@ SERVER_PID=$! 
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
 run_benchmark_serving \
-  --model "$MODEL_NAME" \
+  --model "$MODEL" \
   --port "$PORT" \
   --backend vllm \
   --input-len "$ISL" \
diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh
index ba8dd29ad..16f9729ac 100644
--- a/benchmarks/gptoss_fp4_mi325x_slurm.sh
+++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh
@@ -33,7 +33,6 @@ fi
 export VLLM_USE_AITER_UNIFIED_ATTENTION=1
 export VLLM_ROCM_USE_AITER_MHA=0
 export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0
-MODEL_NAME=${MODEL##*/}
 
 set -x
 vllm serve $MODEL --port $PORT \
@@ -45,7 +44,7 @@ vllm serve $MODEL --port $PORT \
 --block-size=64 \
 --no-enable-prefix-caching \
 --disable-log-requests \
---served-model-name $MODEL_NAME \
+--served-model-name $MODEL \
 --async-scheduling > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
@@ -54,7 +53,7 @@ SERVER_PID=$!
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
 run_benchmark_serving \
-  --model "$MODEL_NAME" \
+  --model "$MODEL" \
   --port "$PORT" \
   --backend vllm \
   --input-len "$ISL" \
diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh
index d04104268..6b772be75 100644
--- a/benchmarks/gptoss_fp4_mi355x_docker.sh
+++ b/benchmarks/gptoss_fp4_mi355x_docker.sh
@@ -27,7 +27,6 @@ export VLLM_ROCM_USE_AITER_MHA=0
 export VLLM_ROCM_USE_AITER_FUSED_MOE_A16W4=1
 
 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
-MODEL_NAME=${MODEL##*/}
 
 set -x
 vllm serve $MODEL --port $PORT \
@@ -39,7 +38,7 @@ vllm serve $MODEL --port $PORT \
 --block-size=64 \
 --no-enable-prefix-caching \
 --disable-log-requests \
---served-model-name $MODEL_NAME \
+--served-model-name $MODEL \
 --async-scheduling > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
@@ -48,7 +47,7 @@ SERVER_PID=$!
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
 run_benchmark_serving \
-  --model "$MODEL_NAME" \
+  --model "$MODEL" \
   --port "$PORT" \
   --backend vllm \
   --input-len "$ISL" \

From e28631cdf5e81f1df4f0df2c7cd38db0eac886af Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Tue, 20 Jan 2026 11:20:59 -0800
Subject: [PATCH 206/214] More evals for spec decode

---
 benchmarks/benchmark_lib.sh                  |  1 +
 utils/collect_eval_results.py                | 15 +++++++------
 utils/matrix_logic/generate_sweep_configs.py | 22 ++++++++++++++------
 utils/summarize.py                           |  1 +
 4 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index ba214c61e..8533d54bd 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -398,6 +398,7 @@ append_lm_eval_summary() {
     {
       "framework": "${fw:-unknown}",
       "precision": "${prec:-unknown}",
+      "spec_decoding": "${SPEC_DECODING}",
       "tp": ${TP:-1},
       "conc": ${CONC:-1},
       "ep": ${EP_SIZE:-1},
diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py
index 5ffaa0cc9..8bf3cf66b 100644
--- a/utils/collect_eval_results.py
+++ b/utils/collect_eval_results.py
@@ -8,8 +8,9 @@
 # Import shared utilities from summarize
 sys.path.insert(0, str(Path(__file__).resolve().parent))
 from summarize import (
-    load_json, MODEL, HARDWARE, FRAMEWORK, PRECISION, ISL, OSL,
-    TP, EP, CONC, DP_ATTENTION, TASK, SCORE, EM_STRICT, EM_FLEXIBLE, N_EFF
+    load_json, MODEL, HARDWARE, FRAMEWORK, PRECISION,
+    TP, EP, CONC, DP_ATTENTION, TASK, SCORE, EM_STRICT, EM_FLEXIBLE, N_EFF,
+    SPEC_DECODING
 )
 
 
@@ -160,8 +161,7 @@ def build_row(meta: Dict[str, Any], m: Dict[str, Any]) -> Dict[str, Any]:
         'hw': meta.get('hw', 'unknown').upper(),
         'framework': meta.get('framework', 'unknown').lower(),
         'precision': meta.get('precision', 'unknown').lower(),
-        'isl': int(meta.get('isl', 0)),
-        'osl': int(meta.get('osl', 0)),
+        'spec_decoding': meta.get('spec_decoding', 'unknown'),
         'tp': int(meta.get('tp', 1)),
         'ep': int(meta.get('ep', 1)),
         'conc': int(meta.get('conc', 0)),
@@ -221,7 +221,7 @@ def main():
 
     # Sort for stable output
     rows.sort(key=lambda r: (
-        r['hw'], r['framework'], r['precision'], r['isl'], r['osl'], r['tp'], r['ep'], r['conc']
+        r['hw'], r['framework'], r['precision'], r.get('spec_decoding', ''), r['tp'], r['ep'], r['conc']
     ))
 
     if not rows:
@@ -229,7 +229,7 @@ def main():
     else:
         # Print table using tabulate
         headers = [
-            MODEL, HARDWARE, FRAMEWORK, PRECISION, ISL, OSL, TP, EP, CONC, DP_ATTENTION,
+            MODEL, HARDWARE, FRAMEWORK, PRECISION, SPEC_DECODING, TP, EP, CONC, DP_ATTENTION,
             TASK, SCORE, EM_STRICT, EM_FLEXIBLE, N_EFF
         ]
@@ -239,8 +239,7 @@ def main():
             r['hw'],
             r['framework'].upper(),
             r['precision'].upper(),
-            r['isl'],
-            r['osl'],
+            r['spec_decoding'],
             r['tp'],
             r['ep'],
             r['conc'],
diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py
index ecedda9ef..b6c2cf2f2 100644
--- a/utils/matrix_logic/generate_sweep_configs.py
+++ b/utils/matrix_logic/generate_sweep_configs.py
@@ -1,3 +1,4 @@
+from ast import For
 import json
 import argparse
 import sys
@@ -32,14 +33,19 @@ def seq_len_to_str(isl: int, osl: int) -> str:
     return seq_len_itos.get((isl, osl), f"{isl}_{osl}")
 
 def mark_eval_entries(matrix_values: list[dict]) -> list[dict]:
-    """Mark entries that should run evaluation.
-
-    For each unique (model, runner, framework, precision, isl, osl) combination:
-    - Mark highest TP with highest conc
-    - Mark lowest TP with highest conc
+    """Eval selection policy (single-node only):
+    - Only consider 1k8k (isl=1024, osl=8192).
+    - For each unique (model, runner, framework, precision, isl, osl, spec-decoding):
+      - Mark highest TP with highest conc
+      - Mark lowest TP with highest conc
+
+    Grouping includes spec-decoding so MTP (mtp) and non-MTP (none) are treated
+    independently.
     """
     from collections import defaultdict
 
+    # Only run evals on 1k8k
+    target_isl, target_osl = seq_len_stoi["1k8k"]
     # Group entries by (model, runner, framework, precision, isl, osl)
     # Only include entries that have a top-level TP (i.e., single-node schema).
    # This avoids relying on structural hints like prefill/decode which may be
@@ -50,13 +56,17 @@
         if Fields.TP.value not in entry:
             continue
 
+        if entry.get(Fields.ISL.value) != target_isl or entry.get(Fields.OSL.value) != target_osl:
+            continue
+
         key = (
             entry[Fields.MODEL.value],
             entry[Fields.RUNNER.value],
             entry[Fields.FRAMEWORK.value],
             entry[Fields.PRECISION.value],
             entry[Fields.ISL.value],
-            entry[Fields.OSL.value]
+            entry[Fields.OSL.value],
+            entry[Fields.SPEC_DECODING.value]
         )
         groups[key].append((i, entry))
diff --git a/utils/summarize.py b/utils/summarize.py
index 5e248164f..b4f4ce6a1 100644
--- a/utils/summarize.py
+++ b/utils/summarize.py
@@ -40,6 +40,7 @@
 EM_STRICT = "EM Strict"
 EM_FLEXIBLE = "EM Flexible"
 N_EFF = "N (eff)"
+SPEC_DECODING = "Spec Decode"
 
 
 def load_json(path: Path) -> Optional[Dict[str, Any]]:

From fa49cdc2e24a93eee720e17e29a85a61ec80f919 Mon Sep 17 00:00:00 2001
From: Bryan Shan <58582368+Oseltamivir@users.noreply.github.com>
Date: Sun, 18 Jan 2026 21:17:45 -0800
Subject: [PATCH 207/214] claude pr comments

---
 .github/workflows/claude-pr-review.yml |  4 ++++
 .github/workflows/claude.yml           | 31 +++++++++++++++++++++++---
 2 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/claude-pr-review.yml b/.github/workflows/claude-pr-review.yml
index d52d5aeb4..8886fde16 100644
--- a/.github/workflows/claude-pr-review.yml
+++ b/.github/workflows/claude-pr-review.yml
@@ -8,6 +8,10 @@ on:
   pull_request_review_comment:
     types: [created]
 
+concurrency:
+  group: pr-review-${{ github.event.pull_request.number }}
+  cancel-in-progress: false
+
 jobs:
   review:
     runs-on: ubuntu-latest
diff --git a/.github/workflows/claude.yml b/.github/workflows/claude.yml
index d306e2011..dad25f81a 100644
--- a/.github/workflows/claude.yml
+++ b/.github/workflows/claude.yml
@@ -5,11 +5,17 @@ on:
     types: [created]
   issues:
     types: [opened, assigned]
+  pull_request_review_comment:
+    types: [created]
+
+concurrency:
+  group: claude-code-${{ github.event.issue.number }}
+  cancel-in-progress: false
 
 jobs:
   claude:
     if: |
-      (github.event_name == 'issue_comment' && contains(github.event.comment.body, '@claude')) ||
+      ((github.event_name == 'issue_comment' || github.event_name == 'pull_request_review_comment') && contains(github.event.comment.body, '@claude')) ||
      (github.event_name == 'issues' && (contains(github.event.issue.body, '@claude') || contains(github.event.issue.title, '@claude')))
     runs-on: ubuntu-latest
     permissions:
@@ -43,9 +49,20 @@ jobs:
          trigger_phrase: "@claude"
          track_progress: true
          allowed_bots: ''
+
+          mcp_config: |
+            {
+              "mcpServers": {
+                "fetch": {
+                  "command": "npx",
+                  "args": ["-y", "@anthropic-ai/mcp-server-fetch@latest"]
+                }
+              }
+            }
+
          claude_args: |
            --model ${{
              contains(github.event.comment.body || github.event.issue.body || '', '@claude sonnet') && 'claude-sonnet-4-5-20250929' ||
              contains(github.event.comment.body || github.event.issue.body || '', '@claude haiku') && 'claude-haiku-4-5-20251001' ||
              'claude-opus-4-5-20251101' }}
-            --allowedTools "Write,Edit,Read,Glob,Grep,mcp__github__*,mcp__github_inline_comment__create_inline_comment,Bash(*,timeout=28800000)"
+            --allowedTools "Write,Edit,Read,Glob,Grep,mcp__github__*,mcp__github_inline_comment__create_inline_comment,mcp__fetch__*,Bash(*,timeout=28800000)"
          prompt: |
            REPO: ${{ github.repository }}
            PR/ISSUE NUMBER: ${{ github.event.pull_request.number || github.event.issue.number }}

            You can analyze the json with:
            ```bash
-            python3 <<'EOF'\nimport json...
+            python3 <<'EOF'\nimport json \nwith open('agg_bmk.json') as f: data = json.load(f) \n# Your analysis code here \nEOF
            ```

            To trigger e2e tests, use the `mcp__github__run_workflow` tool to directly dispatch the e2e-tests.yml workflow.
@@ -135,3 +152,11 @@ jobs:
            After triggering, monitor the workflow run using the returned run_id.

            Focus on: code quality, benchmark config changes, and performance impact.
+
+            ## Web Access:
+            You have internet access via MCP servers:
+            - `mcp__fetch__fetch` - Fetch content from any URL
+
+            ### Useful Documentation URLs:
+            - **sglang**: https://docs.sglang.ai/
+            - **vLLM**: https://docs.vllm.ai/en/latest/

From 7e628ff0e58686e73dcba4006b6a785794aa2960 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 19 Jan 2026 11:52:54 -0800
Subject: [PATCH 208/214] chore(deps): bump the github-actions group with 2 updates (#488)

---
 .github/workflows/claude.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/claude.yml b/.github/workflows/claude.yml
index dad25f81a..35c9df757 100644
--- a/.github/workflows/claude.yml
+++ b/.github/workflows/claude.yml
@@ -27,13 +27,13 @@ jobs:
     steps:
       - name: Generate GitHub App token
         id: app-token
-        uses: actions/create-github-app-token@v1
+        uses: actions/create-github-app-token@v2
         with:
           app-id: ${{ secrets.APP_ID }}
           private-key: ${{ secrets.APP_PRIVATE_KEY }}
 
       - name: Checkout repository
-        uses: actions/checkout@v5
+        uses: actions/checkout@v6
         with:
           fetch-depth: 0
           token: ${{ steps.app-token.outputs.token }}

From 518d00417b796745c49ccf7721bc247b077cdabf Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Mon, 19 Jan 2026 16:17:10 -0500
Subject: [PATCH 209/214] fix: update ep metadata in gb200 dynamo sglang configs to match comments (#486)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Update ep values to use the formula: EP = (NODES × 4 GPUs) / num-workers
for both dsr1-fp8-gb200-dynamo-sglang and dsr1-fp4-gb200-dynamo-sglang
configurations.

The metadata isn't used by sglang dynamo scripts (values are hardcoded),
but the frontend uses these values.
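For example, reading the values straight off this config: a decode group with
DECODE_NODES=8 and num-worker: 1 spans 8 × 4 = 32 GPUs, so ep becomes
(8 × 4) / 1 = 32, while a decode group with DECODE_NODES=4 split across
num-worker: 4 gets (4 × 4) / 4 = 4.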
Fixes #485

Co-authored-by: claude[bot] <41898282+claude[bot]@users.noreply.github.com>
Co-authored-by: functionstackx
---
 .github/configs/nvidia-master.yaml | 48 +++++++++++++++----------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 5ffc6f754..06c37888a 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -906,7 +906,7 @@ dsr1-fp8-gb200-dynamo-sglang:
      # tp, ep, and dp-attn do nothing because they are hardcoded in the following file:
      # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh
      tp: 1
-      ep: 1
+      ep: 8
      dp-attn: true
      additional-settings:
        - "PREFILL_NODES=4"
@@ -915,7 +915,7 @@ dsr1-fp8-gb200-dynamo-sglang:
    decode:
      num-worker: 1
      tp: 1
-      ep: 1
+      ep: 32
      dp-attn: true
      additional-settings:
        - "DECODE_NODES=8"
@@ -928,7 +928,7 @@ dsr1-fp8-gb200-dynamo-sglang:
      # tp, ep, and dp-attn do nothing because they are hardcoded in the following file:
      # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh
      tp: 1
-      ep: 1
+      ep: 4
      dp-attn: true
      additional-settings:
        - "PREFILL_NODES=1"
@@ -937,7 +937,7 @@ dsr1-fp8-gb200-dynamo-sglang:
    decode:
      num-worker: 4
      tp: 1
-      ep: 1
+      ep: 4
      dp-attn: true
      additional-settings:
        - "DECODE_NODES=4"
@@ -950,7 +950,7 @@ dsr1-fp8-gb200-dynamo-sglang:
      # tp, ep, and dp-attn do nothing because they are hardcoded in the following file:
      # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh
      tp: 1
-      ep: 1
+      ep: 8
      dp-attn: true
      additional-settings:
        - "PREFILL_NODES=6"
@@ -959,7 +959,7 @@ dsr1-fp8-gb200-dynamo-sglang:
    decode:
      num-worker: 1
      tp: 1
-      ep: 1
+      ep: 48
      dp-attn: true
      additional-settings:
        - "DECODE_NODES=12"
@@ -973,7 +973,7 @@ dsr1-fp8-gb200-dynamo-sglang:
    prefill:
      num-worker: 1
      tp: 1
-      ep: 1
+      ep: 4
      dp-attn: true
      additional-settings:
        - "PREFILL_NODES=1"
@@ -982,7 +982,7 @@ dsr1-fp8-gb200-dynamo-sglang:
    decode:
      num-worker: 1
      tp: 1
-      ep: 1
+      ep: 4
      dp-attn: true
      additional-settings:
        - "DECODE_NODES=1"
@@ -993,7 +993,7 @@ dsr1-fp8-gb200-dynamo-sglang:
    prefill:
      num-worker: 5
      tp: 1
-      ep: 1
+      ep: 8
      dp-attn: true
      additional-settings:
        - "PREFILL_NODES=10"
@@ -1002,7 +1002,7 @@ dsr1-fp8-gb200-dynamo-sglang:
    decode:
      num-worker: 1
      tp: 1
-      ep: 1
+      ep: 32
      dp-attn: true
      additional-settings:
        - "DECODE_NODES=8"
@@ -1029,7 +1029,7 @@ dsr1-fp4-gb200-dynamo-sglang:
    prefill:
      num-worker: 1
      tp: 1
-      ep: 1
+      ep: 4
      dp-attn: true
      additional-settings:
        - "PREFILL_NODES=1"
@@ -1038,7 +1038,7 @@ dsr1-fp4-gb200-dynamo-sglang:
    decode:
      num-worker: 2
      tp: 1
-      ep: 1
+      ep: 4
      dp-attn: true
      additional-settings:
        - "DECODE_NODES=2"
@@ -1049,7 +1049,7 @@ dsr1-fp4-gb200-dynamo-sglang:
    prefill:
      num-worker: 4
      tp: 1
-      ep: 1
+      ep: 4
      dp-attn: true
      additional-settings:
        - "PREFILL_NODES=4"
@@ -1058,7 +1058,7 @@ dsr1-fp4-gb200-dynamo-sglang:
    decode:
      num-worker: 1
      tp: 1
-      ep: 1
+      ep: 48
      dp-attn: true
      additional-settings:
        - "DECODE_NODES=12"
@@ -1069,7 +1069,7 @@ dsr1-fp4-gb200-dynamo-sglang:
    prefill:
      num-worker: 4
      tp: 1
-      ep: 1
+      ep: 4
      dp-attn: true
      additional-settings:
        - "PREFILL_NODES=4"
@@ -1078,7 +1078,7 @@ dsr1-fp4-gb200-dynamo-sglang:
    decode:
      num-worker: 1
      tp: 1
-      ep: 1
+      ep: 32
      dp-attn: true
      additional-settings:
        - "DECODE_NODES=8"
@@ -1090,7 +1090,7 @@ dsr1-fp4-gb200-dynamo-sglang:
    prefill:
      num-worker: 1
      tp: 1
-      ep: 1
+      ep: 4
      dp-attn: false
      additional-settings:
        - "PREFILL_NODES=1"
@@ -1099,7 +1099,7 @@ dsr1-fp4-gb200-dynamo-sglang:
    decode:
      num-worker: 4
      tp: 1
-      ep: 1
+      ep: 4
      dp-attn: true
      additional-settings:
        - "DECODE_NODES=4"
@@ -1108,7 +1108,7 @@ dsr1-fp4-gb200-dynamo-sglang:
    prefill:
      num-worker: 6
      tp: 1
-      ep: 1
+      ep: 4
      dp-attn: false
      additional-settings:
        - "PREFILL_NODES=6"
@@ -1117,7 +1117,7 @@ dsr1-fp4-gb200-dynamo-sglang:
    decode:
      num-worker: 1
      tp: 1
-      ep: 1
+      ep: 48
      dp-attn: true
      additional-settings:
        - "DECODE_NODES=12"
@@ -1126,7 +1126,7 @@ dsr1-fp4-gb200-dynamo-sglang:
    prefill:
      num-worker: 10
      tp: 1
-      ep: 1
+      ep: 4
      dp-attn: true
      additional-settings:
        - "PREFILL_NODES=10"
@@ -1135,7 +1135,7 @@ dsr1-fp4-gb200-dynamo-sglang:
    decode:
      num-worker: 1
      tp: 1
-      ep: 1
+      ep: 32
      dp-attn: true
      additional-settings:
        - "DECODE_NODES=8"
@@ -1144,7 +1144,7 @@ dsr1-fp4-gb200-dynamo-sglang:
    prefill:
      num-worker: 10
      tp: 1
-      ep: 1
+      ep: 4
      dp-attn: true
      additional-settings:
        - "PREFILL_NODES=10"
@@ -1153,7 +1153,7 @@ dsr1-fp4-gb200-dynamo-sglang:
    decode:
      num-worker: 1
      tp: 1
-      ep: 1
+      ep: 32
      dp-attn: true
      additional-settings:
        - "DECODE_NODES=8"

From 388020f78ba3ceec4714a53e07ece6418d894c06 Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Mon, 19 Jan 2026 16:35:46 -0500
Subject: [PATCH 210/214] Experimental folder (increasing researcher/developer velocity) (#489)

---
 experimental/.gitignore          |  1 +
 experimental/README.md           |  5 +++++
 experimental/multiturn/README.md | 14 ++++++++++++++
 3 files changed, 20 insertions(+)
 create mode 100644 experimental/.gitignore
 create mode 100644 experimental/README.md
 create mode 100644 experimental/multiturn/README.md

diff --git a/experimental/.gitignore b/experimental/.gitignore
new file mode 100644
index 000000000..735d7060f
--- /dev/null
+++ b/experimental/.gitignore
@@ -0,0 +1 @@
+rocm-libraries/
\ No newline at end of file
diff --git a/experimental/README.md b/experimental/README.md
new file mode 100644
index 000000000..f39dfc4af
--- /dev/null
+++ b/experimental/README.md
@@ -0,0 +1,5 @@
+# Experimental
+
+This folder contains experimental WIP code that is mostly Claude Code generated.
+
+**Warning:** Code in this directory is very basic and likely contains errors or incomplete implementations. It is not intended for production use or as part of the official InferenceMAX results.
diff --git a/experimental/multiturn/README.md b/experimental/multiturn/README.md
new file mode 100644
index 000000000..358b53991
--- /dev/null
+++ b/experimental/multiturn/README.md
@@ -0,0 +1,14 @@
+## Experimental WIP: Multi turn with/without CPU KVCache Offloading
+
+lit review
+- https://lmsys.org/blog/2025-09-10-sglang-hicache/
+- sglang refers to GPU HBM as (L1) and CPU DRAM as (L2)
+- https://lmsys.org/images/blog/hicache/mooncake_benchmark.png
+- single turn long context Q&A https://arxiv.org/abs/2311.04939 (seems more like a shared-prefix style similar to cascade attention (precursor to sglang radix attention)) https://flashinfer.ai/2024/02/02/cascade-inference.html
+- Production Alibaba Multi turn dataset https://arxiv.org/abs/2506.02634 (seems to not provide the actual prompts and outputs though, more just prompt lengths and output lengths, etc.)
+- sglang synthetic multi turn benchmark script here https://github.com/sgl-project/sglang/tree/main/benchmark/hicache
+- interestingly sglang blog simulates PD disagg via just setting OSL as 1
+```bash
+python3 benchmark/hicache/bench_multiturn.py --model-path $MODEL_PATH --disable-random-sample \
+--output-length 1 --request-length 2048 \ # simulate P-D disaggregation
+```
\ No newline at end of file

From ef15b99f8fee104f9784a64cbdfcd2b9eb283a7e Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Wed, 21 Jan 2026 09:30:20 -0800
Subject: [PATCH 211/214] summary table

---
 utils/collect_eval_results.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py
index 8bf3cf66b..8b471034c 100644
--- a/utils/collect_eval_results.py
+++ b/utils/collect_eval_results.py
@@ -165,7 +165,7 @@ def build_row(meta: Dict[str, Any], m: Dict[str, Any]) -> Dict[str, Any]:
         'tp': int(meta.get('tp', 1)),
         'ep': int(meta.get('ep', 1)),
         'conc': int(meta.get('conc', 0)),
-        'dp_attention': str(meta.get('dp_attention', False)).lower(),
+        'dp_attention': str(meta.get('dp_attention', "none")).lower(),
         'task': m.get('task', 'unknown'),
         'em_strict': m.get('strict'),
         'em_strict_se': m.get('strict_se'),

From 62079d67994179524748b476d9f81ec1dbb2dc61 Mon Sep 17 00:00:00 2001
From: Bryan Shan <58582368+Oseltamivir@users.noreply.github.com>
Date: Wed, 21 Jan 2026 10:42:03 -0800
Subject: [PATCH 212/214] Remove git installation and repository cloning

Removed git installation check and cloning of bench_serving repository.
---
 benchmarks/benchmark_lib.sh | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 42f57f762..f48e4927c 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -183,20 +183,6 @@ run_benchmark_serving() {
         esac
     done
 
-    # Check if git is installed, install if missing
-    if ! command -v git &> /dev/null; then
-        echo "git not found, installing..."
-        if command -v apt-get &> /dev/null; then
-            sudo apt-get update && sudo apt-get install -y git
-        else
-            echo "Error: Could not install git. Package manager not found."
-            return 1
-        fi
-    fi
-
-    local BENCH_SERVING_DIR
-    BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX)
-    git clone https://github.com/kimbochen/bench_serving.git "$BENCH_SERVING_DIR"
     # Validate all required parameters
     if [[ -z "$model" ]]; then
         echo "Error: --model is required"

From 5409158d0a50e520581289cf3ab6971a91a60897 Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Wed, 21 Jan 2026 11:33:31 -0800
Subject: [PATCH 213/214] evals final

---
 benchmarks/dsr1_fp4_b200_trt_slurm.sh  | 4 ++--
 benchmarks/dsr1_fp8_b200_trt_slurm.sh  | 4 ++--
 benchmarks/dsr1_fp8_h200_trt_slurm.sh  | 4 ++--
 benchmarks/gptoss_fp4_b200_docker.sh   | 3 +--
 benchmarks/gptoss_fp4_h100_docker.sh   | 5 +----
 benchmarks/gptoss_fp4_h100_slurm.sh    | 3 +--
 benchmarks/gptoss_fp4_h200_slurm.sh    | 3 +--
 benchmarks/gptoss_fp4_mi300x_docker.sh | 1 -
 benchmarks/gptoss_fp4_mi325x_slurm.sh  | 1 -
 benchmarks/gptoss_fp4_mi355x_docker.sh | 1 -
 10 files changed, 10 insertions(+), 19 deletions(-)

diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh
index 65edb89d1..7886da1c9 100644
--- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh
+++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh
@@ -88,8 +88,8 @@ fi
 set -x
 MAX_NUM_TOKENS=$(( ($CONC+$ISL+64+63)/64*64 ))
-MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 4096 ? MAX_MODEL_LEN : 4096 ))
-MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 4096 ? MAX_NUM_TOKENS : 4096 ))
+MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 ))
+MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? MAX_NUM_TOKENS : 8192 ))
 
 # Launch TRT-LLM server
 mpirun -n 1 --oversubscribe --allow-run-as-root \
diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh
index 6fb75eeee..42a8cfd3e 100644
--- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh
+++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh
@@ -58,8 +58,8 @@ fi
 set -x
 MAX_NUM_TOKENS=$(( ($CONC+$ISL+64+63)/64*64 ))
-MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 4096 ? MAX_MODEL_LEN : 4096 ))
-MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 4096 ? MAX_NUM_TOKENS : 4096 ))
+MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 ))
+MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? MAX_NUM_TOKENS : 8192 ))
 
 # Launch TRT-LLM server
 mpirun -n 1 --oversubscribe --allow-run-as-root \
diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh
index bc8fffea7..b72df9577 100644
--- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh
+++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh
@@ -58,8 +58,8 @@ fi
 set -x
 MAX_NUM_TOKENS=$(( (CONC + ISL + 64 + 63) / 64 * 64 ))
-MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 4096 ? MAX_MODEL_LEN : 4096 ))
-MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 4096 ? MAX_NUM_TOKENS : 4096 ))
+MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 ))
+MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? MAX_NUM_TOKENS : 8192 ))
 
 # Launch TRT-LLM server
 PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \
diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh
index 2702cd477..841b9df7a 100644
--- a/benchmarks/gptoss_fp4_b200_docker.sh
+++ b/benchmarks/gptoss_fp4_b200_docker.sh
@@ -52,8 +52,7 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT \
 --gpu-memory-utilization 0.9 \
 --tensor-parallel-size $TP \
 --max-num-seqs 512 \
---disable-log-requests \
---served-model-name $MODEL > $SERVER_LOG 2>&1 &
+--disable-log-requests > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh
index a4c848119..aa02bf286 100644
--- a/benchmarks/gptoss_fp4_h100_docker.sh
+++ b/benchmarks/gptoss_fp4_h100_docker.sh
@@ -13,8 +13,6 @@ check_env_vars \
   RANDOM_RANGE_RATIO \
   RESULT_FILENAME
 
-
-# Create a basic vLLM config
 cat > config.yaml << EOF
 async-scheduling: true
 no-enable-prefix-caching: true
@@ -33,8 +31,7 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \
 --gpu-memory-utilization=0.9 \
 --tensor-parallel-size=$TP \
 --max-num-seqs=$CONC \
---disable-log-requests \
---served-model-name $MODEL > $SERVER_LOG 2>&1 &
+--disable-log-requests > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh
index 2e44f95ed..c89104790 100644
--- a/benchmarks/gptoss_fp4_h100_slurm.sh
+++ b/benchmarks/gptoss_fp4_h100_slurm.sh
@@ -35,8 +35,7 @@ PYTHONNOUSERSITE=1 vllm serve $MODEL --host=0.0.0.0 --port=$PORT \
     --gpu-memory-utilization=0.9 \
     --tensor-parallel-size=$TP \
     --max-num-seqs=$CONC \
-    --disable-log-requests \
-    --served-model-name $MODEL > $SERVER_LOG 2>&1 &
+    --disable-log-requests > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh
index abe4c2daf..4504b9417 100644
--- a/benchmarks/gptoss_fp4_h200_slurm.sh
+++ b/benchmarks/gptoss_fp4_h200_slurm.sh
@@ -47,8 +47,7 @@ PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT \
     --gpu-memory-utilization 0.9 \
     --tensor-parallel-size $TP \
     --max-num-seqs $CONC \
-    --disable-log-requests \
-    --served-model-name $MODEL > $SERVER_LOG 2>&1 &
+    --disable-log-requests > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh
index 1019f2086..1dfd0c343 100644
--- a/benchmarks/gptoss_fp4_mi300x_docker.sh
+++ b/benchmarks/gptoss_fp4_mi300x_docker.sh
@@ -41,7 +41,6 @@ vllm serve $MODEL --port $PORT \
 --block-size=64 \
 --no-enable-prefix-caching \
 --disable-log-requests \
---served-model-name $MODEL \
 --async-scheduling > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh
index 16f9729ac..255bb3df5 100644
--- a/benchmarks/gptoss_fp4_mi325x_slurm.sh
+++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh
@@ -44,7 +44,6 @@ vllm serve $MODEL --port $PORT \
 --block-size=64 \
 --no-enable-prefix-caching \
 --disable-log-requests \
---served-model-name $MODEL \
 --async-scheduling > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh
index 0be8558a9..651f1da67 100644
--- a/benchmarks/gptoss_fp4_mi355x_docker.sh
+++ b/benchmarks/gptoss_fp4_mi355x_docker.sh
@@ -37,7 +37,6 @@ vllm serve $MODEL --port $PORT \
 --block-size=64 \
 --no-enable-prefix-caching \
 --disable-log-requests \
---served-model-name $MODEL \
 --async-scheduling > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!

From 9ae0f9066be1ed68730aae8d177e850537e6199c Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Wed, 21 Jan 2026 14:09:51 -0800
Subject: [PATCH 214/214] more retries, lower conc, for stability

---
 benchmarks/benchmark_lib.sh                | 2 +-
 benchmarks/dsr1_fp4_b200_docker.sh         | 2 +-
 benchmarks/dsr1_fp4_b200_slurm.sh          | 2 +-
 benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh  | 2 +-
 benchmarks/dsr1_fp4_b200_trt_slurm.sh      | 2 +-
 benchmarks/dsr1_fp4_mi355x_atom_slurm.sh   | 2 +-
 benchmarks/dsr1_fp4_mi355x_docker.sh       | 2 +-
 benchmarks/dsr1_fp4_mi355x_slurm.sh        | 2 +-
 benchmarks/dsr1_fp8_b200_docker.sh         | 2 +-
 benchmarks/dsr1_fp8_b200_slurm.sh          | 2 +-
 benchmarks/dsr1_fp8_b200_trt_slurm.sh      | 2 +-
 benchmarks/dsr1_fp8_h200_slurm.sh          | 2 +-
 benchmarks/dsr1_fp8_h200_trt_slurm.sh      | 2 +-
 benchmarks/dsr1_fp8_mi300x_docker.sh       | 2 +-
 benchmarks/dsr1_fp8_mi300x_slurm.sh        | 2 +-
 benchmarks/dsr1_fp8_mi325x_docker.sh       | 2 +-
 benchmarks/dsr1_fp8_mi325x_slurm.sh        | 2 +-
 benchmarks/dsr1_fp8_mi355x_atom_slurm.sh   | 2 +-
 benchmarks/dsr1_fp8_mi355x_docker.sh       | 2 +-
 benchmarks/dsr1_fp8_mi355x_slurm.sh        | 2 +-
 benchmarks/gptoss_fp4_b200_docker.sh       | 2 +-
 benchmarks/gptoss_fp4_b200_slurm.sh        | 2 +-
 benchmarks/gptoss_fp4_h100_docker.sh       | 2 +-
 benchmarks/gptoss_fp4_h100_slurm.sh        | 2 +-
 benchmarks/gptoss_fp4_h200_slurm.sh        | 2 +-
 benchmarks/gptoss_fp4_h200_trt_slurm.sh    | 2 +-
 benchmarks/gptoss_fp4_mi300x_docker.sh     | 2 +-
 benchmarks/gptoss_fp4_mi300x_slurm.sh      | 2 +-
 benchmarks/gptoss_fp4_mi325x_docker.sh     | 2 +-
 benchmarks/gptoss_fp4_mi325x_slurm.sh      | 2 +-
 benchmarks/gptoss_fp4_mi355x_atom_slurm.sh | 2 +-
 benchmarks/gptoss_fp4_mi355x_docker.sh     | 2 +-
 benchmarks/gptoss_fp4_mi355x_slurm.sh      | 2 +-
 33 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index f48e4927c..cafa5347f 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -412,7 +412,7 @@ run_lm_eval() {
     --tasks "utils/evals/${task}.yaml" \
     --num_fewshot "${num_fewshot}" \
     --output_path "${results_dir}" --log_samples \
-    --model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=2,num_concurrent=${concurrent_requests},tokenized_requests=False,max_length=${gen_max_tokens}" \
+    --model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=5,num_concurrent=${concurrent_requests},tokenized_requests=False,max_length=${gen_max_tokens}" \
     --gen_kwargs "max_tokens=${gen_max_tokens},temperature=${temperature},top_p=${top_p}"
   local eval_exit=$?
   set +x
diff --git a/benchmarks/dsr1_fp4_b200_docker.sh b/benchmarks/dsr1_fp4_b200_docker.sh
index a5d919f9f..30e564dd9 100644
--- a/benchmarks/dsr1_fp4_b200_docker.sh
+++ b/benchmarks/dsr1_fp4_b200_docker.sh
@@ -61,7 +61,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp4_b200_slurm.sh b/benchmarks/dsr1_fp4_b200_slurm.sh
index 875ad19f5..0da2913d2 100644
--- a/benchmarks/dsr1_fp4_b200_slurm.sh
+++ b/benchmarks/dsr1_fp4_b200_slurm.sh
@@ -58,7 +58,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh
index 104a33ca2..dce21701c 100644
--- a/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh
+++ b/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh
@@ -105,7 +105,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh
index 7886da1c9..459cff1b3 100644
--- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh
+++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh
@@ -121,7 +121,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp4_mi355x_atom_slurm.sh b/benchmarks/dsr1_fp4_mi355x_atom_slurm.sh
index 786c879be..a63039af3 100644
--- a/benchmarks/dsr1_fp4_mi355x_atom_slurm.sh
+++ b/benchmarks/dsr1_fp4_mi355x_atom_slurm.sh
@@ -66,7 +66,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp4_mi355x_docker.sh b/benchmarks/dsr1_fp4_mi355x_docker.sh
index d0807a1c2..ba19b64e3 100644
--- a/benchmarks/dsr1_fp4_mi355x_docker.sh
+++ b/benchmarks/dsr1_fp4_mi355x_docker.sh
@@ -57,7 +57,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/dsr1_fp4_mi355x_slurm.sh b/benchmarks/dsr1_fp4_mi355x_slurm.sh
index c381f199d..63856676e 100644
--- a/benchmarks/dsr1_fp4_mi355x_slurm.sh
+++ b/benchmarks/dsr1_fp4_mi355x_slurm.sh
@@ -58,7 +58,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp8_b200_docker.sh b/benchmarks/dsr1_fp8_b200_docker.sh
index 73497dd3c..dd19b94a0 100644
--- a/benchmarks/dsr1_fp8_b200_docker.sh
+++ b/benchmarks/dsr1_fp8_b200_docker.sh
@@ -93,7 +93,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/dsr1_fp8_b200_slurm.sh b/benchmarks/dsr1_fp8_b200_slurm.sh
index 76549bbd5..da1a7f4cd 100644
--- a/benchmarks/dsr1_fp8_b200_slurm.sh
+++ b/benchmarks/dsr1_fp8_b200_slurm.sh
@@ -90,7 +90,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh
index 42a8cfd3e..1602d802b 100644
--- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh
+++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh
@@ -91,7 +91,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp8_h200_slurm.sh b/benchmarks/dsr1_fp8_h200_slurm.sh
index 657504290..117008a63 100644
--- a/benchmarks/dsr1_fp8_h200_slurm.sh
+++ b/benchmarks/dsr1_fp8_h200_slurm.sh
@@ -62,7 +62,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh
index b72df9577..98a6de420 100644
--- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh
+++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh
@@ -91,7 +91,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/dsr1_fp8_mi300x_docker.sh b/benchmarks/dsr1_fp8_mi300x_docker.sh
index ca2bbdd56..c7de3eec5 100644
--- a/benchmarks/dsr1_fp8_mi300x_docker.sh
+++ b/benchmarks/dsr1_fp8_mi300x_docker.sh
@@ -60,7 +60,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/dsr1_fp8_mi300x_slurm.sh b/benchmarks/dsr1_fp8_mi300x_slurm.sh
index 3c3c00029..f4e029fe5 100644
--- a/benchmarks/dsr1_fp8_mi300x_slurm.sh
+++ b/benchmarks/dsr1_fp8_mi300x_slurm.sh
@@ -65,7 +65,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/dsr1_fp8_mi325x_docker.sh b/benchmarks/dsr1_fp8_mi325x_docker.sh
index be756fa6c..c990ef2a1 100644
--- a/benchmarks/dsr1_fp8_mi325x_docker.sh
+++ b/benchmarks/dsr1_fp8_mi325x_docker.sh
@@ -51,7 +51,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/dsr1_fp8_mi325x_slurm.sh b/benchmarks/dsr1_fp8_mi325x_slurm.sh
index fd2fc3886..82f0833ff 100644
--- a/benchmarks/dsr1_fp8_mi325x_slurm.sh
+++ b/benchmarks/dsr1_fp8_mi325x_slurm.sh
@@ -54,7 +54,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/dsr1_fp8_mi355x_atom_slurm.sh b/benchmarks/dsr1_fp8_mi355x_atom_slurm.sh
index 786c879be..a63039af3 100644
--- a/benchmarks/dsr1_fp8_mi355x_atom_slurm.sh
+++ b/benchmarks/dsr1_fp8_mi355x_atom_slurm.sh
@@ -66,7 +66,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp8_mi355x_docker.sh b/benchmarks/dsr1_fp8_mi355x_docker.sh
index c207802d9..f6527e9b7 100644
--- a/benchmarks/dsr1_fp8_mi355x_docker.sh
+++ b/benchmarks/dsr1_fp8_mi355x_docker.sh
@@ -55,7 +55,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/dsr1_fp8_mi355x_slurm.sh b/benchmarks/dsr1_fp8_mi355x_slurm.sh
index a90fc1067..078a9ec48 100644
--- a/benchmarks/dsr1_fp8_mi355x_slurm.sh
+++ b/benchmarks/dsr1_fp8_mi355x_slurm.sh
@@ -53,7 +53,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh
index 841b9df7a..1a4b55a83 100644
--- a/benchmarks/gptoss_fp4_b200_docker.sh
+++ b/benchmarks/gptoss_fp4_b200_docker.sh
@@ -75,7 +75,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/gptoss_fp4_b200_slurm.sh b/benchmarks/gptoss_fp4_b200_slurm.sh
index a7f507f53..5bcfef9a3 100644
--- a/benchmarks/gptoss_fp4_b200_slurm.sh
+++ b/benchmarks/gptoss_fp4_b200_slurm.sh
@@ -70,7 +70,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh
index aa02bf286..2fd6fc67f 100644
--- a/benchmarks/gptoss_fp4_h100_docker.sh
+++ b/benchmarks/gptoss_fp4_h100_docker.sh
@@ -54,7 +54,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh
index c89104790..1b4da9cce 100644
--- a/benchmarks/gptoss_fp4_h100_slurm.sh
+++ b/benchmarks/gptoss_fp4_h100_slurm.sh
@@ -58,7 +58,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh
index 4504b9417..cfea22b9e 100644
--- a/benchmarks/gptoss_fp4_h200_slurm.sh
+++ b/benchmarks/gptoss_fp4_h200_slurm.sh
@@ -68,7 +68,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/gptoss_fp4_h200_trt_slurm.sh b/benchmarks/gptoss_fp4_h200_trt_slurm.sh
index e06aaa789..875e6ae72 100644
--- a/benchmarks/gptoss_fp4_h200_trt_slurm.sh
+++ b/benchmarks/gptoss_fp4_h200_trt_slurm.sh
@@ -77,7 +77,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh
index 1dfd0c343..467a32a58 100644
--- a/benchmarks/gptoss_fp4_mi300x_docker.sh
+++ b/benchmarks/gptoss_fp4_mi300x_docker.sh
@@ -62,7 +62,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/gptoss_fp4_mi300x_slurm.sh b/benchmarks/gptoss_fp4_mi300x_slurm.sh
index 1f2901113..bc385c264 100644
--- a/benchmarks/gptoss_fp4_mi300x_slurm.sh
+++ b/benchmarks/gptoss_fp4_mi300x_slurm.sh
@@ -69,7 +69,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/gptoss_fp4_mi325x_docker.sh b/benchmarks/gptoss_fp4_mi325x_docker.sh
index 64d2a7291..054f6c377 100644
--- a/benchmarks/gptoss_fp4_mi325x_docker.sh
+++ b/benchmarks/gptoss_fp4_mi325x_docker.sh
@@ -61,7 +61,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh
index 255bb3df5..c0c9597c2 100644
--- a/benchmarks/gptoss_fp4_mi325x_slurm.sh
+++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh
@@ -65,7 +65,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/gptoss_fp4_mi355x_atom_slurm.sh b/benchmarks/gptoss_fp4_mi355x_atom_slurm.sh
index 9cbf4640e..85052b1bc 100644
--- a/benchmarks/gptoss_fp4_mi355x_atom_slurm.sh
+++ b/benchmarks/gptoss_fp4_mi355x_atom_slurm.sh
@@ -67,7 +67,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh
index 651f1da67..7c708ae62 100644
--- a/benchmarks/gptoss_fp4_mi355x_docker.sh
+++ b/benchmarks/gptoss_fp4_mi355x_docker.sh
@@ -58,7 +58,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x
diff --git a/benchmarks/gptoss_fp4_mi355x_slurm.sh b/benchmarks/gptoss_fp4_mi355x_slurm.sh
index bd9633b0c..1e5d87dba 100644
--- a/benchmarks/gptoss_fp4_mi355x_slurm.sh
+++ b/benchmarks/gptoss_fp4_mi355x_slurm.sh
@@ -61,7 +61,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC * 2 ))
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
     append_lm_eval_summary
 fi
 set +x