diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index b2379df46..f1e43a5c4 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -50,6 +50,14 @@ on: disagg: required: true type: string + run-eval: + type: boolean + required: true + default: false + random-range-ratio: + required: false + type: string + default: '0.8' ref: description: "Git ref (branch/sha) to checkout" required: false @@ -74,6 +82,7 @@ env: CONC: ${{ inputs.conc }} SPEC_DECODING: ${{ inputs.spec-decoding }} DISAGG: ${{ inputs.disagg }} + RUN_EVAL: ${{ inputs.run-eval }} permissions: contents: read @@ -82,7 +91,7 @@ jobs: benchmark: runs-on: ${{ inputs.runner }} timeout-minutes: 180 - name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.framework }} ${{ inputs.precision }} tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} conc=${{ inputs.conc }} spec=${{ inputs.spec-decoding }}' + name: "${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.framework }} ${{ inputs.precision }} ${{ inputs.run-eval && 'eval ' || '' }}tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} conc=${{ inputs.conc }} spec=${{ inputs.spec-decoding }}" steps: - name: Resource cleanup run: | @@ -113,7 +122,11 @@ jobs: - name: Launch job script env: RUNNER_NAME: ${{ runner.name }} + RUNNER_TYPE: ${{ inputs.runner }} RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_${{ env.FRAMEWORK }}_tp${{ env.TP }}_ep${{ env.EP_SIZE }}_dpa_${{ env.DP_ATTENTION }}_conc${{ env.CONC }}_specdecode_${{ env.SPEC_DECODING }}_${{ runner.name }} + # Suppress per-job eval markdown from being appended to the step summary. + # We'll publish a single combined eval table in the collection job instead. + GITHUB_STEP_SUMMARY: '' run: | bash ./runners/launch_${RUNNER_NAME%%_*}.sh FOUND_RESULT_FILE= @@ -137,8 +150,27 @@ jobs: RUNNER_TYPE: ${{ inputs.runner }} run: | python3 utils/process_result.py + - name: Upload result uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 with: name: bmk_${{ env.RESULT_FILENAME }} path: agg_${{ env.RESULT_FILENAME }}.json + + - name: Upload eval results (if any) + if: ${{ env.RUN_EVAL == 'true' }} + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 + with: + name: eval_${{ env.EXP_NAME }}_${{ env.RESULT_FILENAME }} + path: | + meta_env.json + results*.json + sample*.jsonl + if-no-files-found: ignore + + - name: Cleanup eval outputs (post-upload) + if: ${{ env.RUN_EVAL == 'true' }} + run: | + rm -f meta_env.json || true + # Remove any eval results JSONs that were moved into workspace + rm -f results*.json || true diff --git a/.github/workflows/collect-evals.yml b/.github/workflows/collect-evals.yml new file mode 100644 index 000000000..606117e79 --- /dev/null +++ b/.github/workflows/collect-evals.yml @@ -0,0 +1,46 @@ +name: Template - Collect Evals + +on: + workflow_call: + inputs: + result-prefix: + required: false + type: string + default: '' + +permissions: + contents: read + +jobs: + collect-evals: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + token: ${{ secrets.REPO_PAT }} + fetch-depth: 0 + + - name: Download eval artifacts + uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0 + with: + path: eval_results/ + pattern: ${{ inputs.result-prefix && format('eval_{0}_*', inputs.result-prefix) || 'eval_*' }} + + - name: Summarize evals + run: | 
+ pip install tabulate + echo "## Eval Summary" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + python3 utils/collect_eval_results.py eval_results/ ${{ inputs.result-prefix || 'all' }} >> $GITHUB_STEP_SUMMARY + + - name: Upload aggregated evals + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 + with: + name: eval_results_${{ inputs.result-prefix || 'all' }} + path: agg_eval_${{ inputs.result-prefix || 'all' }}.json + + - name: Cleanup downloaded eval artifacts + if: ${{ always() }} + run: | + rm -rf eval_results/ || true diff --git a/.github/workflows/collect-results.yml b/.github/workflows/collect-results.yml index ccc2ce4e4..5bfbde52e 100644 --- a/.github/workflows/collect-results.yml +++ b/.github/workflows/collect-results.yml @@ -34,7 +34,9 @@ jobs: python3 utils/summarize.py results/ >> $GITHUB_STEP_SUMMARY - name: Aggregate results - run: python3 utils/collect_results.py results/ ${{ inputs.result-prefix || 'all' }} + run: | + pip install tabulate + python3 utils/collect_results.py results/ ${{ inputs.result-prefix || 'all' }} - name: Upload aggregated results uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 8bc48aed9..d87cba7bd 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -122,6 +122,7 @@ jobs: conc: ${{ matrix.config.conc }} spec-decoding: ${{ matrix.config.spec-decoding }} disagg: ${{ matrix.config.disagg }} + run-eval: ${{ matrix.config.run-eval }} ref: ${{ inputs.ref }} collect-results: @@ -129,9 +130,17 @@ jobs: if: ${{ always() }} uses: ./.github/workflows/collect-results.yml secrets: inherit + with: + result-prefix: "bmk" + + collect-evals: + needs: [test-sweep-multi-node, test-sweep-single-node] + if: ${{ always() }} + uses: ./.github/workflows/collect-evals.yml + secrets: inherit calc-success-rate: - needs: collect-results + needs: [collect-results, collect-evals] if: ${{ always() }} runs-on: ubuntu-latest diff --git a/.github/workflows/run-sweep.yml b/.github/workflows/run-sweep.yml index e449942d1..224bae7f9 100644 --- a/.github/workflows/run-sweep.yml +++ b/.github/workflows/run-sweep.yml @@ -142,6 +142,7 @@ jobs: conc: ${{ matrix.config.conc }} spec-decoding: ${{ matrix.config.spec-decoding }} disagg: ${{ matrix.config.disagg }} + run-eval: ${{ matrix.config.run-eval }} sweep-single-node-1k8k: needs: setup @@ -184,6 +185,21 @@ jobs: with: result-prefix: "bmk" + collect-evals: + needs: + [ + sweep-single-node-1k1k, + sweep-single-node-1k8k, + sweep-single-node-8k1k, + sweep-multi-node-1k1k, + sweep-multi-node-1k8k, + sweep-multi-node-8k1k, + setup, + ] + if: ${{ always() && needs.setup.result != 'skipped' }} + uses: ./.github/workflows/collect-evals.yml + secrets: inherit + upload-changelog-metadata: needs: [setup, collect-results] if: ${{ always() && needs.setup.result != 'skipped' }} diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index d4f5f46be..cafa5347f 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -122,7 +122,6 @@ run_benchmark_serving() { local use_chat_template=false local server_pid="" - # Parse arguments while [[ $# -gt 0 ]]; do case $1 in --model) @@ -183,7 +182,7 @@ run_benchmark_serving() { ;; esac done - + # Validate all required parameters if [[ -z "$model" ]]; then echo "Error: --model is required" @@ -287,3 +286,226 @@ run_benchmark_serving() { return $benchmark_exit_code } + + +# 
------------------------------ +# Eval (lm-eval-harness) helpers +# ------------------------------ + +_install_lm_eval_deps() { + python3 -m pip install -q --no-cache-dir "lm-eval[api]" || true + python3 -m pip install -q --no-cache-dir --no-deps \ + "git+https://github.com/EleutherAI/lm-evaluation-harness.git@b315ef3b05176acc9732bb7fdec116abe1ecc476" || true +} + +# Patch lm-eval filters to be robust to empty strings via sitecustomize +_patch_lm_eval() { + local patch_dir + patch_dir="$(mktemp -d)" + cat > "$patch_dir/sitecustomize.py" <<'PY' +# --- Patch LocalChatCompletion.parse_generations to handle empty content with reasoning_content --- +import re, sys, unicodedata, json +from lm_eval.filters import extraction as ex +from lm_eval.models.openai_completions import LocalChatCompletion as _LCC + +def _le_parse_generations(outputs, **kwargs): + res = [] + if not isinstance(outputs, list): + outputs = [outputs] + for out in (outputs or []): + try: + choices = out.get("choices", []) + tmp = ["" for _ in choices] + for choice in choices: + idx = choice.get("index", 0) + msg = (choice.get("message") or {}) + content = msg.get("content") + if content in (None, "", []): + content = msg.get("reasoning_content") or "" + tmp[idx] = content + except Exception: + tmp = [""] + res.extend(tmp) + return res + +# Keep staticmethod semantics +_LCC.parse_generations = staticmethod(_le_parse_generations) + +# --- Patch TemplateAPI.apply_chat_template to avoid injecting "type": "text" for TRT --- +try: + from lm_eval.models import api_models as _api_models + _TemplateAPI = _api_models.TemplateAPI + _JsonChatStr = _api_models.JsonChatStr +except Exception: + _TemplateAPI = None + _JsonChatStr = None + +if _TemplateAPI is not None and _JsonChatStr is not None: + _orig_apply_chat_template = _TemplateAPI.apply_chat_template + + def _patched_apply_chat_template( + self, + chat_history, + add_generation_prompt: bool = True, + ): + """Applies a chat template to a list of chat history between user and model.""" + if self.tokenizer_backend == "huggingface" and self.tokenized_requests: + return self.tokenizer.apply_chat_template( + chat_history, + tokenize=False, + add_generation_prompt=add_generation_prompt, + continue_final_message=not add_generation_prompt, + ) + elif self.tokenizer_backend == "remote" and self.tokenized_requests: + return chat_history + else: + # NOTE: we no longer inject `"type": "text"` when tokenizer is None / non-HF + return _JsonChatStr( + json.dumps( + [{**item} for item in chat_history], + ensure_ascii=False, + ) + ) + + _TemplateAPI.apply_chat_template = _patched_apply_chat_template +PY + export PYTHONPATH="${patch_dir}:${PYTHONPATH:-}" +} + +run_lm_eval() { + local port="${PORT:-8888}" + local task="${EVAL_TASK:-gsm8k}" + local num_fewshot="${NUM_FEWSHOT:-2}" + local results_dir="${EVAL_RESULT_DIR:-$(mktemp -d /tmp/eval_out-XXXXXX)}" + local gen_max_tokens=16384 + local temperature=0 + local top_p=1 + local concurrent_requests=32 + + while [[ $# -gt 0 ]]; do + case $1 in + --port) port="$2"; shift 2 ;; + --task) task="$2"; shift 2 ;; + --num-fewshot) num_fewshot="$2"; shift 2 ;; + --results-dir) results_dir="$2"; shift 2 ;; + --gen-max-tokens) gen_max_tokens="$2"; shift 2 ;; + --temperature) temperature="$2"; shift 2 ;; + --top-p) top_p="$2"; shift 2 ;; + --concurrent-requests) concurrent_requests="$2"; shift 2 ;; + *) echo "Unknown parameter: $1"; return 1 ;; + esac + done + + _install_lm_eval_deps + _patch_lm_eval + + local openai_server_base="http://0.0.0.0:${port}" + local 
openai_chat_base="${openai_server_base}/v1/chat/completions" + export OPENAI_API_KEY=${OPENAI_API_KEY:-EMPTY} + MODEL_NAME=${MODEL_NAME:-$MODEL} # Prefer MODEL_NAME, else MODEL + + # Export for append_lm_eval_summary to pick up + export EVAL_RESULT_DIR="$results_dir" + + set -x + python3 -m lm_eval --model local-chat-completions --apply_chat_template \ + --tasks "utils/evals/${task}.yaml" \ + --num_fewshot "${num_fewshot}" \ + --output_path "${results_dir}" --log_samples \ + --model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=5,num_concurrent=${concurrent_requests},tokenized_requests=False,max_length=${gen_max_tokens}" \ + --gen_kwargs "max_tokens=${gen_max_tokens},temperature=${temperature},top_p=${top_p}" + local eval_exit=$? + set +x + return $eval_exit +} + +append_lm_eval_summary() { + local results_dir="${EVAL_RESULT_DIR}" + local task="${EVAL_TASK:-gsm8k}" + local out_dir="${results_dir}" + mkdir -p "$out_dir" || true + + # Write minimal meta for collectors that expect it + local meta_json="${out_dir}/meta_env.json" + local model_name="${MODEL_NAME:-$MODEL}" + local dp_json="false" + if [ "${DP_ATTENTION}" = "true" ]; then dp_json="true"; fi + + # Derive framework/precision from env, fallback to parsing RESULT_FILENAME + # RESULT_FILENAME format (from workflow): + # ___tp<...>_ep<...>_dpa_<...>_conc<...>_ + local fw="${FRAMEWORK:-}" + local prec="${PRECISION:-}" + if [[ -z "$fw" || -z "$prec" ]]; then + if [[ -n "${RESULT_FILENAME}" ]]; then + # Extract the two fields immediately before "_tp" + # Handles arbitrary underscores in exp_name by matching from the end + local parsed + parsed=$(echo "${RESULT_FILENAME}" | sed -n 's/.*_\([^_][^_]*\)_\([^_][^_]*\)_tp.*/\1 \2/p') + local p1="${parsed%% *}" + local p2="${parsed#* }" + if [[ -z "$prec" && -n "$p1" && "$p1" != "$parsed" ]]; then + prec="$p1" + fi + if [[ -z "$fw" && -n "$p2" && "$p2" != "$parsed" ]]; then + fw="$p2" + fi + fi + fi + cat > "${meta_json}" </dev/null) + fi + + # Best-effort cleanup of the temp directory + if [ -n "${out_dir}" ] && [ -d "${out_dir}" ]; then + rm -rf --one-file-system "${out_dir}" || rm -rf "${out_dir}" || true + fi + + echo "Moved eval artifacts to: $(pwd)" +} + +# ------------------------------ +# Unified eval entrypoint +# ------------------------------ + +run_eval() { + local framework="${EVAL_FRAMEWORK:-lm-eval}" + local forwarded=() + + while [[ $# -gt 0 ]]; do + case "$1" in + --framework) framework="$2"; shift 2 ;; + *) forwarded+=("$1"); shift ;; + esac + done + + case "$framework" in + lm-eval|lm_eval) run_lm_eval "${forwarded[@]}" ;; + *) echo "Unknown framework '${framework}'"; return 1 ;; + esac +} diff --git a/benchmarks/dsr1_fp4_b200_docker.sh b/benchmarks/dsr1_fp4_b200_docker.sh index ba4c5a236..30e564dd9 100644 --- a/benchmarks/dsr1_fp4_b200_docker.sh +++ b/benchmarks/dsr1_fp4_b200_docker.sh @@ -58,3 +58,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x \ No newline at end of file diff --git a/benchmarks/dsr1_fp4_b200_slurm.sh b/benchmarks/dsr1_fp4_b200_slurm.sh index b9ce026f1..0da2913d2 100644 --- a/benchmarks/dsr1_fp4_b200_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_slurm.sh @@ -55,3 +55,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ 
--result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh index 33d819efa..dce21701c 100644 --- a/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh @@ -102,3 +102,10 @@ run_benchmark_serving \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ \ --use-chat-template + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x \ No newline at end of file diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh index 9a1c1ec67..459cff1b3 100644 --- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh @@ -88,6 +88,8 @@ fi set -x MAX_NUM_TOKENS=$(( ($CONC+$ISL+64+63)/64*64 )) +MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 )) +MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? MAX_NUM_TOKENS : 8192 )) # Launch TRT-LLM server mpirun -n 1 --oversubscribe --allow-run-as-root \ @@ -116,3 +118,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x \ No newline at end of file diff --git a/benchmarks/dsr1_fp4_mi355x_atom_slurm.sh b/benchmarks/dsr1_fp4_mi355x_atom_slurm.sh index 412e90a66..a63039af3 100644 --- a/benchmarks/dsr1_fp4_mi355x_atom_slurm.sh +++ b/benchmarks/dsr1_fp4_mi355x_atom_slurm.sh @@ -64,3 +64,9 @@ run_benchmark_serving \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x \ No newline at end of file diff --git a/benchmarks/dsr1_fp4_mi355x_docker.sh b/benchmarks/dsr1_fp4_mi355x_docker.sh index 1a2d1099f..ba19b64e3 100644 --- a/benchmarks/dsr1_fp4_mi355x_docker.sh +++ b/benchmarks/dsr1_fp4_mi355x_docker.sh @@ -54,3 +54,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/dsr1_fp4_mi355x_slurm.sh b/benchmarks/dsr1_fp4_mi355x_slurm.sh index 9a2db61a7..63856676e 100644 --- a/benchmarks/dsr1_fp4_mi355x_slurm.sh +++ b/benchmarks/dsr1_fp4_mi355x_slurm.sh @@ -55,3 +55,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x \ No newline at end of file diff --git a/benchmarks/dsr1_fp8_b200_docker.sh b/benchmarks/dsr1_fp8_b200_docker.sh index 
3c328b267..dd19b94a0 100644 --- a/benchmarks/dsr1_fp8_b200_docker.sh +++ b/benchmarks/dsr1_fp8_b200_docker.sh @@ -90,3 +90,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/dsr1_fp8_b200_slurm.sh b/benchmarks/dsr1_fp8_b200_slurm.sh index 5406df238..da1a7f4cd 100644 --- a/benchmarks/dsr1_fp8_b200_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_slurm.sh @@ -87,3 +87,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh index fdc70ad2e..1602d802b 100644 --- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh @@ -58,6 +58,8 @@ fi set -x MAX_NUM_TOKENS=$(( ($CONC+$ISL+64+63)/64*64 )) +MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 )) +MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? MAX_NUM_TOKENS : 8192 )) # Launch TRT-LLM server mpirun -n 1 --oversubscribe --allow-run-as-root \ @@ -86,3 +88,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x \ No newline at end of file diff --git a/benchmarks/dsr1_fp8_h200_slurm.sh b/benchmarks/dsr1_fp8_h200_slurm.sh index 5d7c4d02e..117008a63 100644 --- a/benchmarks/dsr1_fp8_h200_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_slurm.sh @@ -24,7 +24,7 @@ export TORCH_CUDA_ARCH_LIST="9.0" set -x if [[ $ISL -eq 1024 && $OSL -eq 1024 ]]; then - PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --tokenizer-path $MODEL \ + PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL \ --host 0.0.0.0 --port $PORT --trust-remote-code \ --tensor-parallel-size=$TP --data-parallel-size=1 \ --disable-radix-cache --max-running-requests 512 --cuda-graph-max-bs 512 \ @@ -33,7 +33,7 @@ if [[ $ISL -eq 1024 && $OSL -eq 1024 ]]; then --decode-log-interval 1 \ > $SERVER_LOG 2>&1 & else - PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --tokenizer-path $MODEL \ + PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL \ --host 0.0.0.0 --port $PORT --trust-remote-code \ --tensor-parallel-size=$TP --data-parallel-size=1 \ --disable-radix-cache --max-running-requests 256 --cuda-graph-max-bs 256 \ @@ -59,3 +59,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index 9c2a431fc..98a6de420 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ 
-57,7 +57,9 @@ fi set -x -MAX_NUM_TOKENS=$(( ($CONC+$ISL+64+63)/64*64 )) +MAX_NUM_TOKENS=$(( (CONC + ISL + 64 + 63) / 64 * 64 )) +MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 )) +MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? MAX_NUM_TOKENS : 8192 )) # Launch TRT-LLM server PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \ @@ -86,3 +88,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/dsr1_fp8_mi300x_docker.sh b/benchmarks/dsr1_fp8_mi300x_docker.sh index 40481d5d3..c7de3eec5 100644 --- a/benchmarks/dsr1_fp8_mi300x_docker.sh +++ b/benchmarks/dsr1_fp8_mi300x_docker.sh @@ -57,3 +57,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/dsr1_fp8_mi300x_slurm.sh b/benchmarks/dsr1_fp8_mi300x_slurm.sh index a27c1000e..f4e029fe5 100644 --- a/benchmarks/dsr1_fp8_mi300x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi300x_slurm.sh @@ -62,3 +62,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/dsr1_fp8_mi325x_docker.sh b/benchmarks/dsr1_fp8_mi325x_docker.sh index aaa3c83d7..c990ef2a1 100644 --- a/benchmarks/dsr1_fp8_mi325x_docker.sh +++ b/benchmarks/dsr1_fp8_mi325x_docker.sh @@ -48,3 +48,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/dsr1_fp8_mi325x_slurm.sh b/benchmarks/dsr1_fp8_mi325x_slurm.sh index 639b960d0..82f0833ff 100644 --- a/benchmarks/dsr1_fp8_mi325x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi325x_slurm.sh @@ -51,3 +51,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/dsr1_fp8_mi355x_atom_slurm.sh b/benchmarks/dsr1_fp8_mi355x_atom_slurm.sh index 412e90a66..a63039af3 100644 --- a/benchmarks/dsr1_fp8_mi355x_atom_slurm.sh +++ b/benchmarks/dsr1_fp8_mi355x_atom_slurm.sh @@ -64,3 +64,9 @@ run_benchmark_serving \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x \ No newline at end of file diff --git a/benchmarks/dsr1_fp8_mi355x_docker.sh 
b/benchmarks/dsr1_fp8_mi355x_docker.sh index 906d8060e..f6527e9b7 100644 --- a/benchmarks/dsr1_fp8_mi355x_docker.sh +++ b/benchmarks/dsr1_fp8_mi355x_docker.sh @@ -52,3 +52,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/dsr1_fp8_mi355x_slurm.sh b/benchmarks/dsr1_fp8_mi355x_slurm.sh index 4f51837fc..078a9ec48 100644 --- a/benchmarks/dsr1_fp8_mi355x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi355x_slurm.sh @@ -50,3 +50,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh index d3772a484..1a4b55a83 100644 --- a/benchmarks/gptoss_fp4_b200_docker.sh +++ b/benchmarks/gptoss_fp4_b200_docker.sh @@ -47,9 +47,12 @@ export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) set -x -vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ ---gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs 512 \ -> $SERVER_LOG 2>&1 & +vllm serve $MODEL --host 0.0.0.0 --port $PORT \ +--config config.yaml \ +--gpu-memory-utilization 0.9 \ +--tensor-parallel-size $TP \ +--max-num-seqs 512 \ +--disable-log-requests > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -69,3 +72,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/gptoss_fp4_b200_slurm.sh b/benchmarks/gptoss_fp4_b200_slurm.sh index 559cb5661..5bcfef9a3 100644 --- a/benchmarks/gptoss_fp4_b200_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_slurm.sh @@ -67,3 +67,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/gptoss_fp4_b200_trt_docker.sh b/benchmarks/gptoss_fp4_b200_trt_docker.sh index 2829a8900..9fdeb4b1f 100644 --- a/benchmarks/gptoss_fp4_b200_trt_docker.sh +++ b/benchmarks/gptoss_fp4_b200_trt_docker.sh @@ -87,3 +87,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC )) + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh index 703a2a93e..954d7ca93 100644 --- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh @@ -103,3 +103,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename 
"$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC )) + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index e67486d90..2fd6fc67f 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -31,7 +31,7 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --gpu-memory-utilization=0.9 \ --tensor-parallel-size=$TP \ --max-num-seqs=$CONC \ -> $SERVER_LOG 2>&1 & +--disable-log-requests > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -51,3 +51,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh index 73f1f0a70..1b4da9cce 100644 --- a/benchmarks/gptoss_fp4_h100_slurm.sh +++ b/benchmarks/gptoss_fp4_h100_slurm.sh @@ -25,15 +25,17 @@ EOF SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) export TORCH_CUDA_ARCH_LIST="9.0" +PORT=${PORT:-8888} + export VLLM_MXFP4_USE_MARLIN=1 set -x PYTHONNOUSERSITE=1 vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ ---config config.yaml \ ---gpu-memory-utilization=0.9 \ ---tensor-parallel-size=$TP \ ---max-num-seqs=$CONC \ - > $SERVER_LOG 2>&1 & + --config config.yaml \ + --gpu-memory-utilization=0.9 \ + --tensor-parallel-size=$TP \ + --max-num-seqs=$CONC \ + --disable-log-requests > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -53,3 +55,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh index 795fd846f..cfea22b9e 100644 --- a/benchmarks/gptoss_fp4_h200_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_slurm.sh @@ -15,7 +15,6 @@ check_env_vars \ echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -set -x hf download $MODEL pip install datasets pandas @@ -38,14 +37,17 @@ max-model-len: $CALCULATED_MAX_MODEL_LEN EOF SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +export TORCH_CUDA_ARCH_LIST="9.0" PORT=$(( 8888 + $PORT_OFFSET )) -export TORCH_CUDA_ARCH_LIST="9.0" export VLLM_MXFP4_USE_MARLIN=1 -PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ - --gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \ - > $SERVER_LOG 2>&1 & +PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT \ + --config config.yaml \ + --gpu-memory-utilization 0.9 \ + --tensor-parallel-size $TP \ + --max-num-seqs $CONC \ + --disable-log-requests > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -63,3 +65,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/gptoss_fp4_h200_trt_slurm.sh b/benchmarks/gptoss_fp4_h200_trt_slurm.sh index e151ee93c..875e6ae72 100644 --- a/benchmarks/gptoss_fp4_h200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_trt_slurm.sh @@ -43,7 +43,7 @@ print_iter_log: true stream_interval: 20 EOF -mpirun -n 1 --oversubscribe --allow-run-as-root \ +PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \ trtllm-serve $MODEL \ --max_batch_size $CONC \ --max_num_tokens 20000 \ @@ -74,3 +74,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh index e66004f9d..467a32a58 100644 --- a/benchmarks/gptoss_fp4_mi300x_docker.sh +++ b/benchmarks/gptoss_fp4_mi300x_docker.sh @@ -59,3 +59,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/gptoss_fp4_mi300x_slurm.sh b/benchmarks/gptoss_fp4_mi300x_slurm.sh index ca7b7fc2f..bc385c264 100644 --- a/benchmarks/gptoss_fp4_mi300x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi300x_slurm.sh @@ -35,6 +35,8 @@ export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 +# +## Start up vllm server set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ @@ -64,3 +66,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/gptoss_fp4_mi325x_docker.sh b/benchmarks/gptoss_fp4_mi325x_docker.sh index 8b7dd5c87..054f6c377 100644 --- a/benchmarks/gptoss_fp4_mi325x_docker.sh +++ b/benchmarks/gptoss_fp4_mi325x_docker.sh @@ -58,3 +58,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh index 4b8532aa6..c0c9597c2 100644 --- a/benchmarks/gptoss_fp4_mi325x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh @@ -44,8 +44,7 @@ vllm serve $MODEL --port $PORT \ --block-size=64 \ --no-enable-prefix-caching \ --disable-log-requests \ ---async-scheduling \ -> $SERVER_LOG 2>&1 & +--async-scheduling > $SERVER_LOG 2>&1 & 
SERVER_PID=$! @@ -63,3 +62,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/gptoss_fp4_mi355x_atom_slurm.sh b/benchmarks/gptoss_fp4_mi355x_atom_slurm.sh index 778a78310..85052b1bc 100644 --- a/benchmarks/gptoss_fp4_mi355x_atom_slurm.sh +++ b/benchmarks/gptoss_fp4_mi355x_atom_slurm.sh @@ -65,3 +65,9 @@ run_benchmark_serving \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh index bff0be872..7c708ae62 100644 --- a/benchmarks/gptoss_fp4_mi355x_docker.sh +++ b/benchmarks/gptoss_fp4_mi355x_docker.sh @@ -55,3 +55,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/gptoss_fp4_mi355x_slurm.sh b/benchmarks/gptoss_fp4_mi355x_slurm.sh index 18aa13fff..1e5d87dba 100644 --- a/benchmarks/gptoss_fp4_mi355x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi355x_slurm.sh @@ -28,6 +28,8 @@ export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_FUSED_MOE_A16W4=1 +# +## Start up vllm server set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ @@ -56,3 +58,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index a6854173c..76fcfa83b 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -31,7 +31,7 @@ docker run --rm --init --network host --name $server_name \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE -e DP_ATTENTION \ -e NCCL_GRAPH_REGISTER=0 \ -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO \ +-e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e RUN_EVAL -e RUNNER_TYPE \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh" diff --git a/runners/launch_b200-nvd.sh b/runners/launch_b200-nvd.sh index c6889aa1c..a0b704d03 100644 --- a/runners/launch_b200-nvd.sh +++ b/runners/launch_b200-nvd.sh @@ -32,7 +32,7 @@ docker run --rm --init --network host --name $server_name \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE -e DP_ATTENTION \ -e NCCL_GRAPH_REGISTER=0 \ -e 
TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO \ +-e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e RUN_EVAL -e RUNNER_TYPE \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh" diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh index d1ddc26de..0174087e4 100644 --- a/runners/launch_h100-cr.sh +++ b/runners/launch_h100-cr.sh @@ -10,7 +10,7 @@ docker run --rm --network=host --name=$server_name \ --runtime=nvidia --gpus=all --ipc=host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \ +-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RUN_EVAL -e RUNNER_TYPE -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \ -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --entrypoint=/bin/bash \ $IMAGE \ diff --git a/runners/launch_mi300x-amd.sh b/runners/launch_mi300x-amd.sh index 780e5a2f0..55fffdb7c 100644 --- a/runners/launch_mi300x-amd.sh +++ b/runners/launch_mi300x-amd.sh @@ -14,7 +14,7 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ --e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \ +-e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL -e RUNNER_TYPE \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi300x_docker.sh" diff --git a/runners/launch_mi300x-cr.sh b/runners/launch_mi300x-cr.sh index 8fbdaee63..5bd6bd0e2 100644 --- a/runners/launch_mi300x-cr.sh +++ b/runners/launch_mi300x-cr.sh @@ -14,7 +14,7 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ --e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \ +-e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL -e RUNNER_TYPE \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi300x_docker.sh" diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py new file mode 100644 index 000000000..8b471034c --- /dev/null +++ b/utils/collect_eval_results.py @@ -0,0 +1,265 @@ +#!/usr/bin/env python3 +import sys +import json +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple +from tabulate import tabulate + +# Import shared utilities from summarize +sys.path.insert(0, str(Path(__file__).resolve().parent)) +from summarize import ( + load_json, MODEL, HARDWARE, FRAMEWORK, PRECISION, + TP, EP, CONC, DP_ATTENTION, TASK, SCORE, EM_STRICT, EM_FLEXIBLE, N_EFF, + SPEC_DECODING +) + + +def find_eval_sets(root: Path) -> List[Path]: + """Return 
directories that contain a meta_env.json (one set per job). + + Structure: eval_results//meta_env.json + """ + out: List[Path] = [] + try: + for d in root.iterdir(): + if d.is_dir() and (d / 'meta_env.json').exists(): + out.append(d) + except Exception: + pass + return out + + +def detect_eval_jsons(d: Path) -> Tuple[Optional[Path], Optional[Path]]: + """Return (lm_eval_json) if present. + + Checks immediate directory for result JSONs. + """ + immediate_jsons = list(d.glob('results*.json')) + [ + p for p in d.glob('*.json') if p.name != 'meta_env.json' + ] + + lm_path = None + le_path = None + + for p in immediate_jsons: + data = load_json(p) + if not isinstance(data, dict): + continue + + if 'lm_eval_version' in data: + # lm-eval harness - pick latest if multiple + if lm_path is None or p.stat().st_mtime > lm_path.stat().st_mtime: + lm_path = p + + return lm_path, le_path + + +def extract_lm_metrics(json_path: Path) -> List[Dict[str, Any]]: + """Extract metrics from lm-eval harness result JSON. + + Returns a list of metric dicts, one per task in the results. + + Uses explicit structure from the JSON file: + - Task names from results keys + - Metric name from configs.metric_list + - Filter names from configs.filter_list + - Values from results[task][metric,filter] + """ + data = load_json(json_path) or {} + results = data.get('results', {}) + configs = data.get('configs', {}) + + if not results: + return [] + + extracted = [] + + for task in results.keys(): + task_results = results[task] + task_config = configs.get(task, {}) + + # Base metric: from config's metric_list + metric_list = task_config.get('metric_list', []) + base_metric = metric_list[0]['metric'] if metric_list else 'exact_match' + + # Filters: from config's filter_list + filter_list = task_config.get('filter_list', []) + + strict_val, strict_se = None, None + flex_val, flex_se = None, None + accuracy_val, accuracy_se = None, None + + # Helper to get value/stderr pair for filtered metrics + def get_val_se(filter_name: str) -> Tuple[Optional[float], Optional[float]]: + val_key = f"{base_metric},{filter_name}" + se_key = f"{base_metric}_stderr,{filter_name}" + return task_results.get(val_key), task_results.get(se_key) + + # Extract metrics based on filter_list + if not filter_list: + # No filters - check for accuracy or use base metric + if 'acc' in task_results: + accuracy_val = task_results.get('acc') + accuracy_se = task_results.get('acc_stderr') + else: + strict_val = task_results.get(base_metric) + strict_se = task_results.get(f"{base_metric}_stderr") + else: + # Extract metrics for each filter + for f in filter_list: + fname = f['name'] + if 'strict' in fname: + strict_val, strict_se = get_val_se(fname) + elif 'flex' in fname or 'extract' in fname: + flex_val, flex_se = get_val_se(fname) + + # N-samples (effective count) + n_eff = data.get('n-samples', {}).get(task, {}).get('effective') + + # Model name + model = ( + data.get('model_name') + or task_config.get('metadata', {}).get('model') + ) + + extracted.append({ + 'task': task, + 'strict': strict_val, + 'strict_se': strict_se, + 'flex': flex_val, + 'flex_se': flex_se, + 'accuracy': accuracy_val, + 'accuracy_se': accuracy_se, + 'n_eff': n_eff, + 'model': model, + 'source': str(json_path) + }) + + return extracted + + +def pct(x: Any) -> str: + """Format value as percentage.""" + try: + return f"{float(x)*100:.2f}%" + except Exception: + return 'N/A' + + +def se(x: Any) -> str: + """Format stderr as percentage with ± prefix.""" + try: + return f" ±{float(x)*100:.2f}%" + 
except Exception: + return '' + + +def build_row(meta: Dict[str, Any], m: Dict[str, Any]) -> Dict[str, Any]: + """Build a result row from metadata and extracted metrics.""" + row = { + 'model': m.get('model') or meta.get('model', 'unknown'), + 'hw': meta.get('hw', 'unknown').upper(), + 'framework': meta.get('framework', 'unknown').lower(), + 'precision': meta.get('precision', 'unknown').lower(), + 'spec_decoding': meta.get('spec_decoding', 'unknown'), + 'tp': int(meta.get('tp', 1)), + 'ep': int(meta.get('ep', 1)), + 'conc': int(meta.get('conc', 0)), + 'dp_attention': str(meta.get('dp_attention', "none")).lower(), + 'task': m.get('task', 'unknown'), + 'em_strict': m.get('strict'), + 'em_strict_se': m.get('strict_se'), + 'em_flexible': m.get('flex'), + 'em_flexible_se': m.get('flex_se'), + 'n_eff': m.get('n_eff'), + 'source': m.get('source'), + } + + # Add universal score field (primary metric for unified comparison) + if m.get('strict') is not None: + row['score'] = m.get('strict') + row['score_name'] = 'em_strict' + row['score_se'] = m.get('strict_se') + elif m.get('accuracy') is not None: + row['score'] = m.get('accuracy') + row['score_name'] = 'accuracy' + row['score_se'] = m.get('accuracy_se') + else: + row['score'] = None + row['score_name'] = None + row['score_se'] = None + + return row + + +def main(): + if len(sys.argv) < 3: + print('Usage: collect_eval_results.py ') + sys.exit(1) + + root = Path(sys.argv[1]) + exp_name = sys.argv[2] + + rows: List[Dict[str, Any]] = [] + for d in find_eval_sets(root): + meta = load_json(d / 'meta_env.json') or {} + lm_path, le_path = detect_eval_jsons(d) + + # Extract metrics (prefer lm-eval) - returns list for multi-task support + if lm_path: + metrics_list = extract_lm_metrics(lm_path) + else: + continue + + if not metrics_list: + continue + + # Build row for each task in the results + for m in metrics_list: + row = build_row(meta, m) + rows.append(row) + + # Sort for stable output + rows.sort(key=lambda r: ( + r['hw'], r['framework'], r['precision'], r.get('spec_decoding', ''), r['tp'], r['ep'], r['conc'] + )) + + if not rows: + print('> No eval results found to summarize.') + else: + # Print table using tabulate + headers = [ + MODEL, HARDWARE, FRAMEWORK, PRECISION, SPEC_DECODING, TP, EP, CONC, DP_ATTENTION, + TASK, SCORE, EM_STRICT, EM_FLEXIBLE, N_EFF + ] + + table_rows = [ + [ + r['model'], + r['hw'], + r['framework'].upper(), + r['precision'].upper(), + r['spec_decoding'], + r['tp'], + r['ep'], + r['conc'], + r['dp_attention'], + r['task'], + f"{pct(r['score'])}{se(r['score_se'])}", + f"{pct(r['em_strict'])}{se(r['em_strict_se'])}", + f"{pct(r['em_flexible'])}{se(r['em_flexible_se'])}", + r['n_eff'] or '' + ] + for r in rows + ] + + print(tabulate(table_rows, headers=headers, tablefmt="github")) + + # Write JSON aggregate + out_path = Path(f'agg_eval_{exp_name}.json') + with open(out_path, 'w') as f: + json.dump(rows, f, indent=2) + + +if __name__ == '__main__': + main() diff --git a/utils/evals/EVALS.md b/utils/evals/EVALS.md new file mode 100644 index 000000000..fcdcd5360 --- /dev/null +++ b/utils/evals/EVALS.md @@ -0,0 +1,26 @@ +# Evals + +## What? +Quick graded QnA which measures model performance. Examples of test suites: +- **gsm8k**: Grade school math questions +- **gpqa**: Graduate level, Google-Proof multiple choice questions +- **math500**: Math questions spanning topics like probability, algebra, trigonometry, and geometry. + +## When? +At highest concurrency for highest TP and lowest TP, per GPU per model per ISL/OSL. 
Logic is defined in `mark_eval_entries` of `utils/matrix_logic/generate_sweep_configs.py` + +## Why? +To verify how model outputs are affected by throughput optimizations. +- TP/Conc might affect model outputs +- Check kernel implementations for correctness + +## How? +- `run_eval`, defined in `benchmarks/benchmark_lib.sh`, is called in `benchmarks/*` and runs EleutherAI/lm-evaluation-harness (lm-eval) against the same endpoint as the throughput benchmark. JSON results are processed and converted to a table with `utils/collect_eval_results.py`. + +## Misc +The following files are task definitions from lm-eval; see the comments in each file for more info on the changes: +- `utils/evals/math500.yaml` +- `utils/evals/gsm8k.yaml` + + + diff --git a/utils/evals/gsm8k.yaml b/utils/evals/gsm8k.yaml new file mode 100644 index 000000000..73a1f7c1e --- /dev/null +++ b/utils/evals/gsm8k.yaml @@ -0,0 +1,48 @@ +# YAML from https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/gsm8k/gsm8k.yaml +# Changed doc_to_text so model answers properly. Also see lm-evaluation-harness#3411. +tag: + - math_word_problems +task: gsm8k +dataset_path: gsm8k +dataset_name: main +output_type: generate_until +training_split: train +fewshot_split: train +test_split: test +doc_to_text: "Question: {{question}}\nEnd your answer with: #### \nAnswer:" +doc_to_target: "{{answer}}" #" {{answer.split('### ')[-1].rstrip()}}" +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: false + regexes_to_ignore: + - "," + - "\\$" + - "(?s).*#### " + - "\\.$" +generation_kwargs: + until: + - "Question:" + - "" + - "<|im_end|>" + do_sample: false + temperature: 0.0 +repeats: 1 +num_fewshot: 5 +filter_list: + - name: "strict-match" + filter: + - function: "regex" + group_select: -1 + regex_pattern: "#### (\\-?[0-9\\.\\,]+)" + - function: "take_first" + - name: "flexible-extract" + filter: + - function: "regex" + group_select: -1 + regex_pattern: "(-?[$0-9.,]{2,})|(-?[0-9]+)" + - function: "take_first" +metadata: + version: 3.0 diff --git a/utils/evals/math500.yaml b/utils/evals/math500.yaml new file mode 100644 index 000000000..2e172e7f0 --- /dev/null +++ b/utils/evals/math500.yaml @@ -0,0 +1,36 @@ +# YAML from https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/hendrycks_math/hendrycks_math_algebra.yaml +# Changed regex and prompt +tag: + - math_word_problems +task: hendrycks_math_algebra +dataset_path: HuggingFaceH4/MATH-500 +process_docs: !function utils.process_docs +dataset_name: algebra +output_type: generate_until +training_split: train +test_split: test +doc_to_text: "You are solving competition math problems.\n\nFormat rules:\n- Answer in a new line that starts with `Answer: `.\n- After `Answer: `, write ONLY the answer as inline LaTeX.\n- Use ONLY ASCII LaTeX commands (e.g. \\pi, \\frac{1}{2}, -). NO Unicode symbols.\n- Do NOT wrap the answer in $, $$, \\( \\), \\[ \\], or any other delimiters.\n- Do NOT use \\displaystyle or any display-style commands. Answer only this problem, the rest are examples. 
Problem: {{problem}}\n" +process_results: !function utils.process_results +doc_to_target: "{{answer}}" +generation_kwargs: + until: + - "Problem:" + do_sample: false + temperature: 0 +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + regexes_to_ignore: + - "\\\\left" + - "\\\\right" + - "\\s+" +filter_list: + - name: "strict-match" + filter: + - function: "regex" + group_select: -1 + regex_pattern: "Answer:\\s*([^\\n]+)" + - function: "take_first" +metadata: + version: 1.0 \ No newline at end of file diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py index 67b406d49..b6c2cf2f2 100644 --- a/utils/matrix_logic/generate_sweep_configs.py +++ b/utils/matrix_logic/generate_sweep_configs.py @@ -1,3 +1,4 @@ +from ast import For import json import argparse import sys @@ -31,6 +32,77 @@ def seq_len_to_str(isl: int, osl: int) -> str: """ return seq_len_itos.get((isl, osl), f"{isl}_{osl}") +def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: + """Eval selection policy (single-node only): + - Only consider 1k8k (isl=1024, osl=8192). + - For each unique (model, runner, framework, precision, isl, osl, spec-decoding): + - Mark highest TP with highest conc + - Mark lowest TP with highest conc + + Grouping includes spec-decoding so MTP (mtp) and non-MTP (none) are treated + independently. + """ + from collections import defaultdict + + # Only run evals on 1k8k + target_isl, target_osl = seq_len_stoi["1k8k"] + # Group entries by (model, runner, framework, precision, isl, osl) + # Only include entries that have a top-level TP (i.e., single-node schema). + # This avoids relying on structural hints like prefill/decode which may be + # reused by future single-node disaggregated modes. 
+ groups = defaultdict(list) + for i, entry in enumerate(matrix_values): + # Skip entries without a top-level TP field + if Fields.TP.value not in entry: + continue + + if entry.get(Fields.ISL.value) != target_isl or entry.get(Fields.OSL.value) != target_osl: + continue + + key = ( + entry[Fields.MODEL.value], + entry[Fields.RUNNER.value], + entry[Fields.FRAMEWORK.value], + entry[Fields.PRECISION.value], + entry[Fields.ISL.value], + entry[Fields.OSL.value], + entry[Fields.SPEC_DECODING.value] + ) + groups[key].append((i, entry)) + + # For each group, find highest TP/highest conc and lowest TP/highest conc + eval_indices = set() + for key, entries in groups.items(): + if not entries: + continue + + # Find min and max TP values + min_tp = min(e[Fields.TP.value] for _, e in entries) + max_tp = max(e[Fields.TP.value] for _, e in entries) + + # Find highest conc for highest TP + highest_tp_entries = [(i, e) for i, e in entries if e[Fields.TP.value] == max_tp] + if highest_tp_entries: + max_conc_highest_tp = max(e[Fields.CONC.value] for _, e in highest_tp_entries) + for i, e in highest_tp_entries: + if e[Fields.CONC.value] == max_conc_highest_tp: + eval_indices.add(i) + + # Find highest conc for lowest TP (only if different from max_tp) + if min_tp != max_tp: + lowest_tp_entries = [(i, e) for i, e in entries if e[Fields.TP.value] == min_tp] + if lowest_tp_entries: + max_conc_lowest_tp = max(e[Fields.CONC.value] for _, e in lowest_tp_entries) + for i, e in lowest_tp_entries: + if e[Fields.CONC.value] == max_conc_lowest_tp: + eval_indices.add(i) + + # Mark the selected entries + for i, entry in enumerate(matrix_values): + entry[Fields.RUN_EVAL.value] = i in eval_indices + + return matrix_values + def generate_full_sweep(args, all_config_data, runner_data): """Generate full sweep configurations with optional filtering. @@ -528,6 +600,17 @@ def main(): default='.github/configs/runners.yaml', help='Configuration file holding runner information (YAML format, defaults to .github/configs/runners.yaml)' ) + eval_group = parent_parser.add_mutually_exclusive_group() + eval_group.add_argument( + '--run-evals', + action='store_true', + help='When specified, run evals on a subset of configs (in addition to all configs).' + ) + eval_group.add_argument( + '--evals-only', + action='store_true', + help='When specified, run ONLY the eval subset (excludes non-eval configs).' 
+ ) # Create main parser parser = argparse.ArgumentParser( @@ -696,6 +779,13 @@ def main(): matrix_values = generate_test_config_sweep(args, all_config_data) else: parser.error(f"Unknown command: {args.command}") + + # Handle eval options (mutually exclusive) + if args.run_evals or args.evals_only: + matrix_values = mark_eval_entries(matrix_values) + # IF --evals-only is specified, filter to only eval entries + if args.evals_only: + matrix_values = [e for e in matrix_values if e.get(Fields.RUN_EVAL.value, False)] print(json.dumps(matrix_values)) return matrix_values diff --git a/utils/matrix_logic/validation.py b/utils/matrix_logic/validation.py index ea6aa4ece..b8a102684 100644 --- a/utils/matrix_logic/validation.py +++ b/utils/matrix_logic/validation.py @@ -51,6 +51,9 @@ class Fields(Enum): EXP_NAME = 'exp-name' DISAGG = 'disagg' + # Eval + RUN_EVAL = 'run-eval' + """ Below is the validation logic for the OUTPUT of utils/matrix_logic/generate_sweep_configs.py, i.e., @@ -85,6 +88,7 @@ class SingleNodeMatrixEntry(BaseModel): max_model_len: int = Field(alias=Fields.MAX_MODEL_LEN.value) exp_name: str = Field(alias=Fields.EXP_NAME.value) disagg: bool + run_eval: bool = Field(alias=Fields.RUN_EVAL.value, default=False) class WorkerConfig(BaseModel): @@ -121,6 +125,7 @@ class MultiNodeMatrixEntry(BaseModel): max_model_len: int = Field(alias=Fields.MAX_MODEL_LEN.value) exp_name: str = Field(alias=Fields.EXP_NAME.value) disagg: bool + run_eval: bool = Field(alias=Fields.RUN_EVAL.value, default=False) def validate_matrix_entry(entry: dict, is_multinode: bool) -> dict: diff --git a/utils/process_changelog.py b/utils/process_changelog.py index c156e2361..7d21047ff 100644 --- a/utils/process_changelog.py +++ b/utils/process_changelog.py @@ -113,6 +113,7 @@ def main(): *configs_to_run, "--config-files", *MASTER_CONFIGS, + "--run-evals" ], capture_output=True, text=True, diff --git a/utils/summarize.py b/utils/summarize.py index a46c2e02a..b4f4ce6a1 100644 --- a/utils/summarize.py +++ b/utils/summarize.py @@ -1,6 +1,7 @@ import sys import json from pathlib import Path +from typing import Any, Dict, Optional from tabulate import tabulate # Header constants @@ -33,95 +34,122 @@ DECODE_WORKERS = "Decode Workers" DECODE_GPUS = "Decode GPUs" -results = [] -results_dir = Path(sys.argv[1]) -for result_path in results_dir.rglob('*.json'): - with open(result_path) as f: - result = json.load(f) - results.append(result) - -single_node_results = [r for r in results if not r['is_multinode']] -multinode_results = [r for r in results if r['is_multinode']] - -# Single-node and multi-node results have different fields and therefore need to be printed separately -if single_node_results: - single_node_results.sort(key=lambda r: ( - r['infmax_model_prefix'], r['hw'], r['framework'], r['precision'], r['isl'], r['osl'], r['tp'], r['ep'], r['conc'])) - - single_node_headers = [ - MODEL, SERVED_MODEL, HARDWARE, FRAMEWORK, PRECISION, ISL, OSL, TP, EP, DP_ATTENTION, - CONC, TTFT, TPOT, INTERACTIVITY, E2EL, TPUT_PER_GPU, OUTPUT_TPUT_PER_GPU, INPUT_TPUT_PER_GPU - ] - - single_node_rows = [ - [ - r['infmax_model_prefix'], - r['model'], - r['hw'].upper(), - r['framework'].upper(), - r['precision'].upper(), - r['isl'], - r['osl'], - r['tp'], - r['ep'], - r['dp_attention'], - r['conc'], - f"{r['median_ttft'] * 1000:.4f}", - f"{r['median_tpot'] * 1000:.4f}", - f"{r['median_intvty']:.4f}", - f"{r['median_e2el']:.4f}", - f"{r['tput_per_gpu']:.4f}", - f"{r['output_tput_per_gpu']:.4f}", - f"{r['input_tput_per_gpu']:.4f}", +# Eval 
constants +TASK = "Task" +SCORE = "Score" +EM_STRICT = "EM Strict" +EM_FLEXIBLE = "EM Flexible" +N_EFF = "N (eff)" +SPEC_DECODING = "Spec Decode" + + +def load_json(path: Path) -> Optional[Dict[str, Any]]: + """Load JSON file and return dict, or None on error.""" + try: + with open(path, 'r') as f: + return json.load(f) + except Exception: + return None + + +def main(): + if len(sys.argv) < 2: + print("Usage: python summarize.py ") + sys.exit(1) + + results = [] + results_dir = Path(sys.argv[1]) + for result_path in results_dir.rglob('*.json'): + result = load_json(result_path) + if result and 'is_multinode' in result: + results.append(result) + + single_node_results = [r for r in results if not r['is_multinode']] + multinode_results = [r for r in results if r['is_multinode']] + + # Single-node and multi-node results have different fields and therefore need to be printed separately + if single_node_results: + single_node_results.sort(key=lambda r: ( + r['infmax_model_prefix'], r['hw'], r['framework'], r['precision'], r['isl'], r['osl'], r['tp'], r['ep'], r['conc'])) + + single_node_headers = [ + MODEL, SERVED_MODEL, HARDWARE, FRAMEWORK, PRECISION, ISL, OSL, TP, EP, DP_ATTENTION, + CONC, TTFT, TPOT, INTERACTIVITY, E2EL, TPUT_PER_GPU, OUTPUT_TPUT_PER_GPU, INPUT_TPUT_PER_GPU ] - for r in single_node_results - ] - - print("## Single-Node Results\n") - print(tabulate(single_node_rows, headers=single_node_headers, tablefmt="github")) - print("\n") - -if multinode_results: - multinode_results.sort(key=lambda r: (r['infmax_model_prefix'], r['hw'], r['framework'], r['precision'], r['isl'], - r['osl'], r['prefill_tp'], r['prefill_ep'], r['decode_tp'], r['decode_ep'], r['conc'])) - - multinode_headers = [ - MODEL, SERVED_MODEL, HARDWARE, FRAMEWORK, PRECISION, ISL, OSL, - PREFILL_TP, PREFILL_EP, PREFILL_DP_ATTN, PREFILL_WORKERS, PREFILL_GPUS, - DECODE_TP, DECODE_EP, DECODE_DP_ATTN, DECODE_WORKERS, DECODE_GPUS, - CONC, TTFT, TPOT, INTERACTIVITY, E2EL, TPUT_PER_GPU, OUTPUT_TPUT_PER_GPU, INPUT_TPUT_PER_GPU - ] - - multinode_rows = [ - [ - r['infmax_model_prefix'], - r['model'], - r['hw'].upper(), - r['framework'].upper(), - r['precision'].upper(), - r['isl'], - r['osl'], - r['prefill_tp'], - r['prefill_ep'], - r['prefill_dp_attention'], - r['prefill_num_workers'], - r['num_prefill_gpu'], - r['decode_tp'], - r['decode_ep'], - r['decode_dp_attention'], - r['decode_num_workers'], - r['num_decode_gpu'], - r['conc'], - f"{r['median_ttft'] * 1000:.4f}", - f"{r['median_tpot'] * 1000:.4f}", - f"{r['median_intvty']:.4f}", - f"{r['median_e2el']:.4f}", - f"{r['tput_per_gpu']:.4f}", - f"{r['output_tput_per_gpu']:.4f}", - f"{r['input_tput_per_gpu']:.4f}", + + single_node_rows = [ + [ + r['infmax_model_prefix'], + r['model'], + r['hw'].upper(), + r['framework'].upper(), + r['precision'].upper(), + r['isl'], + r['osl'], + r['tp'], + r['ep'], + r['dp_attention'], + r['conc'], + f"{r['median_ttft'] * 1000:.4f}", + f"{r['median_tpot'] * 1000:.4f}", + f"{r['median_intvty']:.4f}", + f"{r['median_e2el']:.4f}", + f"{r['tput_per_gpu']:.4f}", + f"{r['output_tput_per_gpu']:.4f}", + f"{r['input_tput_per_gpu']:.4f}", + ] + for r in single_node_results ] - for r in multinode_results - ] - print("## Multi-Node Results\n") - print(tabulate(multinode_rows, headers=multinode_headers, tablefmt="github")) + print("## Single-Node Results\n") + print(tabulate(single_node_rows, headers=single_node_headers, tablefmt="github")) + print("\n") + + if multinode_results: + multinode_results.sort(key=lambda r: (r['infmax_model_prefix'], r['hw'], 
r['framework'], r['precision'], r['isl'], + r['osl'], r['prefill_tp'], r['prefill_ep'], r['decode_tp'], r['decode_ep'], r['conc'])) + + multinode_headers = [ + MODEL, SERVED_MODEL, HARDWARE, FRAMEWORK, PRECISION, ISL, OSL, + PREFILL_TP, PREFILL_EP, PREFILL_DP_ATTN, PREFILL_WORKERS, PREFILL_GPUS, + DECODE_TP, DECODE_EP, DECODE_DP_ATTN, DECODE_WORKERS, DECODE_GPUS, + CONC, TTFT, TPOT, INTERACTIVITY, E2EL, TPUT_PER_GPU, OUTPUT_TPUT_PER_GPU, INPUT_TPUT_PER_GPU + ] + + multinode_rows = [ + [ + r['infmax_model_prefix'], + r['model'], + r['hw'].upper(), + r['framework'].upper(), + r['precision'].upper(), + r['isl'], + r['osl'], + r['prefill_tp'], + r['prefill_ep'], + r['prefill_dp_attention'], + r['prefill_num_workers'], + r['num_prefill_gpu'], + r['decode_tp'], + r['decode_ep'], + r['decode_dp_attention'], + r['decode_num_workers'], + r['num_decode_gpu'], + r['conc'], + f"{r['median_ttft'] * 1000:.4f}", + f"{r['median_tpot'] * 1000:.4f}", + f"{r['median_intvty']:.4f}", + f"{r['median_e2el']:.4f}", + f"{r['tput_per_gpu']:.4f}", + f"{r['output_tput_per_gpu']:.4f}", + f"{r['input_tput_per_gpu']:.4f}", + ] + for r in multinode_results + ] + + print("## Multi-Node Results\n") + print(tabulate(multinode_rows, headers=multinode_headers, tablefmt="github")) + + +if __name__ == "__main__": + main() \ No newline at end of file
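
Reviewer note (not part of the patch): the selection rule in `mark_eval_entries` can be hard to read inline. Below is a minimal, standalone sketch of the same idea — per group, mark the highest-concurrency config at the highest TP, plus the highest-concurrency config at the lowest TP when the two TPs differ. It uses plain string keys and groups only by model; the real code groups on model/runner/framework/precision/ISL/OSL/spec-decoding via the `Fields` enum, filters on a target ISL/OSL, and writes a `run-eval` flag back onto every entry, so treat this purely as an illustration.

```python
# Illustrative sketch only: simplified stand-in for mark_eval_entries.
from collections import defaultdict


def pick_eval_indices(matrix):
    """Per model: mark the highest-concurrency entry at the highest TP and,
    if the lowest TP differs, the highest-concurrency entry at that TP too."""
    groups = defaultdict(list)
    for i, entry in enumerate(matrix):
        if 'tp' not in entry:            # mirrors the top-level TP check in the patch
            continue
        groups[entry['model']].append((i, entry))

    selected = set()
    for entries in groups.values():
        tps = {e['tp'] for _, e in entries}
        for tp in {min(tps), max(tps)}:  # the set dedups when min == max
            same_tp = [(i, e) for i, e in entries if e['tp'] == tp]
            top_conc = max(e['conc'] for _, e in same_tp)
            selected.update(i for i, e in same_tp if e['conc'] == top_conc)
    return selected


matrix = [
    {'model': 'm', 'tp': 1, 'conc': 4},
    {'model': 'm', 'tp': 1, 'conc': 64},    # lowest TP, highest conc  -> eval
    {'model': 'm', 'tp': 8, 'conc': 4},
    {'model': 'm', 'tp': 8, 'conc': 256},   # highest TP, highest conc -> eval
]
assert pick_eval_indices(matrix) == {1, 3}
```

With `--run-evals` these marked entries run evals in addition to the full sweep; with `--evals-only` everything unmarked is dropped from the emitted matrix.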
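
A similarly small sketch of why `run_eval` gets `default=False` in the Pydantic models: matrix entries generated without `--run-evals`/`--evals-only` carry no `run-eval` key and must still validate. This assumes Pydantic v2 (`model_validate`); the toy model mirrors only the new field, not the full `SingleNodeMatrixEntry`/`MultiNodeMatrixEntry`.

```python
# Toy model mirroring only the new field; assumes Pydantic v2.
from pydantic import BaseModel, Field


class EntryWithEval(BaseModel):
    run_eval: bool = Field(alias='run-eval', default=False)


# Entries produced before this change (no 'run-eval' key) still validate and
# default to False, so downstream flag checks remain backward compatible.
assert EntryWithEval.model_validate({}).run_eval is False
assert EntryWithEval.model_validate({'run-eval': True}).run_eval is True
```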
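
Finally, the new eval header constants in `utils/summarize.py` (`TASK`, `SCORE`, `EM_STRICT`, `EM_FLEXIBLE`, `N_EFF`, `SPEC_DECODING`) are added but not exercised in the hunks shown here. One plausible use, sketched under that assumption, is a `tabulate` table in the same github format as the perf tables; the row fields and values below are hypothetical placeholders, not results from this patch.

```python
# Hypothetical sketch: how the new eval header constants might feed a table.
from tabulate import tabulate

TASK = "Task"
EM_STRICT = "EM Strict"
EM_FLEXIBLE = "EM Flexible"
N_EFF = "N (eff)"

# Placeholder row purely for formatting illustration.
eval_rows = [["gsm8k", f"{0.9121:.4f}", f"{0.9356:.4f}", 1319]]
print("## Eval Results\n")
print(tabulate(eval_rows, headers=[TASK, EM_STRICT, EM_FLEXIBLE, N_EFF], tablefmt="github"))
```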