From c031e6d19d7762de834b6d7109da4f4f0127aa6f Mon Sep 17 00:00:00 2001 From: x-senpai-x Date: Wed, 15 Apr 2026 23:47:48 +0530 Subject: [PATCH 01/16] noir_execution_success test suite in workflow --- .github/workflows/noir-execution-success.yml | 83 +++ scripts/run_noir_execution_success.sh | 594 +++++++++++++++++++ scripts/vendor_noir_execution_success.sh | 41 ++ 3 files changed, 718 insertions(+) create mode 100644 .github/workflows/noir-execution-success.yml create mode 100755 scripts/run_noir_execution_success.sh create mode 100755 scripts/vendor_noir_execution_success.sh diff --git a/.github/workflows/noir-execution-success.yml b/.github/workflows/noir-execution-success.yml new file mode 100644 index 000000000..2209672c2 --- /dev/null +++ b/.github/workflows/noir-execution-success.yml @@ -0,0 +1,83 @@ +name: Noir Execution Success Tests + +# Provide a noir_ref to test against any Noir release. +on: + workflow_dispatch: + inputs: + noir_ref: + description: "noir-lang/noir git ref (tag / branch / SHA)" + required: false + default: "v1.0.0-beta.19" + +env: + CARGO_TERM_COLOR: always + NOIR_REF: ${{ inputs.noir_ref || 'v1.0.0-beta.19' }} + +# Cancel any in-progress run on the same branch when a new one is triggered. +concurrency: + group: noir-exec-success-${{ github.ref }} + cancel-in-progress: true + +jobs: + noir-execution-success: + name: Noir execution_success suite (${{ inputs.noir_ref || 'v1.0.0-beta.19' }}) + runs-on: [self-hosted, Linux, ARM64, provekit-build] + + steps: + - uses: actions/checkout@v4 + + - name: Setup Rust toolchain + uses: moonrepo/setup-rust@v1 + with: + channel: nightly-2026-03-04 + cache-base: main + + - name: Build provekit-cli + run: cargo build --release --bin provekit-cli + + - name: Setup Noir toolchain + uses: noir-lang/noirup@v0.1.2 + with: + toolchain: ${{ env.NOIR_REF }} + + # Sparse checkout — only fetch test_programs/, not the full noir repo. 
+ - name: Fetch noir test programs (sparse checkout) + run: | + tmpdir=$(mktemp -d) + echo "NOIR_REPO_DIR=${tmpdir}/noir" >> "$GITHUB_ENV" + git clone \ + --depth 1 \ + --filter=blob:none \ + --sparse \ + --branch "$NOIR_REF" \ + https://github.com/noir-lang/noir.git "${tmpdir}/noir" + git -C "${tmpdir}/noir" sparse-checkout set \ + test_programs/execution_success \ + test_programs/test_libraries + echo "Cloned noir @ $(git -C ${tmpdir}/noir rev-parse HEAD)" + + - name: Run execution_success suite + env: + PROVEKIT_BIN: ${{ github.workspace }}/target/release/provekit-cli + LOG_DIR: ${{ github.workspace }}/noir-execution-logs + # NOIR_REPO_DIR is set by the previous step via $GITHUB_ENV + run: | + bash scripts/run_noir_execution_success.sh + + # Upload logs on every run (pass or fail) for 7 days. + - name: Upload test logs + if: always() + uses: actions/upload-artifact@v4 + with: + name: noir-execution-logs-${{ github.run_id }} + path: noir-execution-logs/ + retention-days: 7 + + # Always clean up the temp clone, even if the test step failed. + - name: Cleanup noir clone + if: always() + run: | + if [[ -n "${NOIR_REPO_DIR:-}" && -d "${NOIR_REPO_DIR}" ]]; then + rm -rf "${NOIR_REPO_DIR}" + echo "Cleaned up ${NOIR_REPO_DIR}" + fi diff --git a/scripts/run_noir_execution_success.sh b/scripts/run_noir_execution_success.sh new file mode 100755 index 000000000..461ffbb9a --- /dev/null +++ b/scripts/run_noir_execution_success.sh @@ -0,0 +1,594 @@ +#!/usr/bin/env bash +# run_noir_execution_success.sh +# +# Run the Noir execution_success test suite through provekit-cli. +# +# Environment variables (all optional): +# NOIR_REPO_DIR Path to a cloned noir-lang/noir repo root. +# When set, tests come from +# NOIR_REPO_DIR/test_programs/{execution_success,test_libraries}. +# When unset, falls back to the vendored path +# REPO_ROOT/test-programs/noir/. 
+# PROVEKIT_BIN Path to provekit-cli binary (default: target/release/provekit-cli) +# LOG_DIR Directory for per-test logs and summary +# MAX_TESTS Cap the number of tests (0 = unlimited) +# TEST_FILTER Regex filter on test name +# REQUIRED_NARGO_VERSION Nargo version string to require (default 1.0.0-beta.19) +# ENABLE_ENUMS_FALLBACK Retry compile with -Zenums on 'enums' feature error (0/1, default 1) + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" + +# --------------------------------------------------------------------------- +# Resolve test corpus root (CI clone vs. local vendored copy) +# --------------------------------------------------------------------------- +if [[ -n "${NOIR_REPO_DIR:-}" ]]; then + TEST_ROOT="${NOIR_REPO_DIR}/test_programs/execution_success" + TEST_LIB_ROOT="${NOIR_REPO_DIR}/test_programs/test_libraries" +else + NOIR_ROOT="${REPO_ROOT}/test-programs/noir" + TEST_ROOT="${NOIR_ROOT}/execution_success" + TEST_LIB_ROOT="${NOIR_ROOT}/test_libraries" +fi + +PROVEKIT_BIN="${PROVEKIT_BIN:-${REPO_ROOT}/target/release/provekit-cli}" +MAX_TESTS="${MAX_TESTS:-0}" +REQUIRED_NARGO_VERSION="${REQUIRED_NARGO_VERSION:-1.0.0-beta.19}" +ENABLE_ENUMS_FALLBACK="${ENABLE_ENUMS_FALLBACK:-1}" +TEST_FILTER="${TEST_FILTER:-}" +RUN_ID="$(date -u +%Y%m%dT%H%M%SZ)" +LOG_DIR="${LOG_DIR:-${REPO_ROOT}/scripts/noir_execution_logs/${RUN_ID}}" + +if [[ "${LOG_DIR}" != /* ]]; then + LOG_DIR="${REPO_ROOT}/${LOG_DIR}" +fi + +# --------------------------------------------------------------------------- +# Unimplemented-blackbox skip list +# These tests use blackbox functions not yet supported by provekit. +# They are counted as SKIP (not FAIL) and will be added back once supported. 
+# --------------------------------------------------------------------------- +SKIP_TESTS=( + # BLAKE3 + a_6 + array_dynamic_blackbox_input + array_dynamic_nested_blackbox_input + blake3 + conditional_1 + conditional_regression_short_circuit + regression_4449 + # ECDSA_SECP256K1 + bench_ecdsa_secp256k1 + ecdsa_secp256k1 + ecdsa_secp256k1_invalid_inputs + ecdsa_secp256k1_invalid_pub_key_in_inactive_branch + # ECDSA_SECP256R1 + ecdsa_secp256r1 + ecdsa_secp256r1_3x + ecdsa_secp256r1_invalid_pub_key_in_inactive_branch + ecdsa_secp256r1_msg_equals_order + # EMBEDDED_CURVE_ADD + embedded_curve_ops + regression_5045 + regression_7744 + # AES128_ENCRYPT + aes128_encrypt + # BLAKE2S + a_7 +) + +# Build a fast associative-array lookup +declare -A SKIP_SET +for _t in "${SKIP_TESTS[@]}"; do + SKIP_SET["${_t}"]=1 +done + +if [[ ! -d "${TEST_ROOT}" ]]; then + echo "ERROR: Missing test corpus at ${TEST_ROOT}" + if [[ -z "${NOIR_REPO_DIR:-}" ]]; then + echo "Hint: run scripts/vendor_noir_execution_success.sh first, or set NOIR_REPO_DIR." + else + echo "Hint: check that NOIR_REPO_DIR (${NOIR_REPO_DIR}) contains test_programs/execution_success." + fi + exit 1 +fi + +if [[ ! -x "${PROVEKIT_BIN}" ]]; then + echo "Missing provekit-cli binary at ${PROVEKIT_BIN}" + echo "Build it first: cargo build --release --bin provekit-cli" + exit 1 +fi + +if ! command -v nargo >/dev/null 2>&1; then + echo "nargo is required but was not found in PATH." 
+ echo "Install with noirup and set version: noirup --version v1.0.0-beta.19" + exit 1 +fi + +nargo_version="$(nargo --version)" +if [[ "${nargo_version}" != *"${REQUIRED_NARGO_VERSION}"* ]]; then + echo "Unsupported nargo version: ${nargo_version}" + echo "Expected version containing: ${REQUIRED_NARGO_VERSION}" + echo "Switch with: noirup --version ${REQUIRED_NARGO_VERSION}" + exit 1 +fi + +mkdir -p "${LOG_DIR}/per_test" +GROUPED_REPORT_FILE="${LOG_DIR}/grouped_error_report.txt" + +shopt -s nullglob globstar + +discover_test_dirs() { + TEST_ROOT="${TEST_ROOT}" python3 - <<'PY' +from pathlib import Path +import tomllib +import os + +root = Path(os.environ["TEST_ROOT"]) +nargo_data = {} + +for nargo in root.rglob("Nargo.toml"): + rel = nargo.parent.relative_to(root).as_posix() + try: + data = tomllib.loads(nargo.read_text()) + except Exception: + data = {} + nargo_data[rel] = data + +workspace_default_roots = set() +for rel, data in nargo_data.items(): + ws = data.get("workspace") + if isinstance(ws, dict) and "default-member" in ws: + workspace_default_roots.add(rel) + +suppressed = set() +for ws_rel in workspace_default_roots: + ws_path = Path(ws_rel) if ws_rel != "." else Path() + for rel in nargo_data: + rel_path = Path(rel) if rel != "." 
else Path() + if rel_path != ws_path and ws_path in rel_path.parents: + suppressed.add(rel) + +candidates = set(workspace_default_roots) +for rel, data in nargo_data.items(): + if rel in suppressed: + continue + + pkg = data.get("package") + if isinstance(pkg, dict) and "name" in pkg: + if (root / rel / "Prover.toml").is_file(): + candidates.add(rel) + +for rel in sorted(candidates): + print(rel) +PY +} + +resolve_prover_toml() { + local project_dir="$1" + local package_name="$2" + + PROJECT_DIR="${project_dir}" PACKAGE_NAME="${package_name}" python3 - <<'PY' +from pathlib import Path +import tomllib +import os + +project_dir = Path(os.environ["PROJECT_DIR"]) +package_name = os.environ["PACKAGE_NAME"] + +candidates = [] +for nargo in sorted(project_dir.rglob("Nargo.toml")): + try: + data = tomllib.loads(nargo.read_text()) + except Exception: + continue + + pkg = data.get("package") + if not isinstance(pkg, dict): + continue + + if pkg.get("name") != package_name: + continue + + prover = nargo.parent / "Prover.toml" + if prover.is_file(): + candidates.append(prover.relative_to(project_dir).as_posix()) + +if candidates: + candidates.sort(key=lambda p: (p.count("/"), p)) + print(candidates[0]) + raise SystemExit(0) + +root_prover = project_dir / "Prover.toml" +if root_prover.is_file(): + print("Prover.toml") + raise SystemExit(0) + +all_provers = sorted(project_dir.rglob("Prover.toml")) +if len(all_provers) == 1: + print(all_provers[0].relative_to(project_dir).as_posix()) + raise SystemExit(0) + +print("") +PY +} + +read_workdir_package_name() { + local project_dir="$1" + PROJECT_DIR="${project_dir}" python3 - <<'PY' +from pathlib import Path +import tomllib +import os + +nargo = Path(os.environ["PROJECT_DIR"]) / "Nargo.toml" +if not nargo.is_file(): + print("") + raise SystemExit(0) + +try: + data = tomllib.loads(nargo.read_text()) +except Exception: + print("") + raise SystemExit(0) + +pkg = data.get("package") +if isinstance(pkg, dict): + print(pkg.get("name", "")) 
+else: + print("") +PY +} + +relative_path() { + local from_dir="$1" + local to_path="$2" + FROM_DIR="${from_dir}" TO_PATH="${to_path}" python3 - <<'PY' +import os +print(os.path.relpath(os.environ["TO_PATH"], os.environ["FROM_DIR"])) +PY +} + + + +append_stage_marker() { + local log_file="$1" + local stage_name="$2" + local stage_status="$3" + printf '\n[%s] %s: %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "${stage_status}" "${stage_name}" >> "${log_file}" +} + +mapfile -t test_dirs < <(discover_test_dirs) + +if [[ "${#test_dirs[@]}" -eq 0 ]]; then + echo "No runnable test programs found under ${TEST_ROOT}" + exit 1 +fi + +total=0 +passed=0 +failed=0 +skipped=0 + +if [[ ! -d "${TEST_LIB_ROOT}" ]]; then + echo "WARNING: missing ${TEST_LIB_ROOT}; path-based dependency tests may fail." + echo "Run scripts/vendor_noir_execution_success.sh to vendor test_libraries as well." +fi + +for test_name in "${test_dirs[@]}"; do + if [[ -n "${TEST_FILTER}" && ! "${test_name}" =~ ${TEST_FILTER} ]]; then + continue + fi + + (( total += 1 )) + + if [[ "${MAX_TESTS}" -gt 0 && "${total}" -gt "${MAX_TESTS}" ]]; then + break + fi + + # leaf name (no sub-path) is what we key on in the skip set + leaf_name="${test_name%%/*}" + test_dir="${TEST_ROOT}/${test_name}" + safe_test_name="${test_name//\//__}" + # --- Unimplemented blackbox skip list: no log, no noise --- + if [[ "${SKIP_SET["${leaf_name}"]:-}" == "1" ]]; then + echo "SKIP (blackbox): ${test_name}" + (( skipped += 1 )) + continue + fi + + test_log="${LOG_DIR}/per_test/${safe_test_name}.log" + + echo "" + echo "==> [${total}] ${test_name}" + + : > "${test_log}" + { + echo "test_name=${test_name}" + echo "test_dir=${test_dir}" + echo "run_id=${RUN_ID}" + echo "nargo_version=${nargo_version}" + } >> "${test_log}" + + if [[ ! 
-f "${test_dir}/Nargo.toml" ]]; then + echo "SKIP: missing Nargo.toml" + append_stage_marker "${test_log}" "test" "SKIP" + echo "SKIP: missing Nargo.toml" >> "${test_log}" + (( skipped += 1 )) + continue + fi + + if [[ ! -d "${TEST_LIB_ROOT}" ]] && grep -qr 'test_libraries' "${test_dir}"/Nargo.toml 2>/dev/null; then + echo "SKIP: missing test_libraries for relative path dependency" + append_stage_marker "${test_log}" "test" "SKIP" + echo "SKIP: missing test_libraries for relative path dependency" >> "${test_log}" + (( skipped += 1 )) + continue + fi + + sandbox_root="$(mktemp -d)" + sandbox_noir_root="${sandbox_root}/test-programs/noir" + sandbox_exec_root="${sandbox_noir_root}/execution_success" + fixture_name="${test_name%%/*}" + fixture_src="${TEST_ROOT}/${fixture_name}" + fixture_dst="${sandbox_exec_root}/${fixture_name}" + + mkdir -p "${sandbox_exec_root}" + cp -R "${fixture_src}" "${fixture_dst}" + + if [[ -d "${TEST_LIB_ROOT}" ]]; then + mkdir -p "${sandbox_noir_root}" + ln -s "${TEST_LIB_ROOT}" "${sandbox_noir_root}/test_libraries" + fi + + workdir="${sandbox_exec_root}/${test_name}" + echo "sandbox_root=${sandbox_root}" >> "${test_log}" + echo "workdir=${workdir}" >> "${test_log}" + + append_stage_marker "${test_log}" "nargo compile" "START" + compile_ok=0 + + if (cd "${workdir}" && nargo compile >> "${test_log}" 2>&1); then + compile_ok=1 + elif [[ "${ENABLE_ENUMS_FALLBACK}" -eq 1 ]] && grep -q "unstable feature 'enums'" "${test_log}"; then + append_stage_marker "${test_log}" "nargo compile -Zenums" "RETRY" + if (cd "${workdir}" && nargo compile -Zenums >> "${test_log}" 2>&1); then + compile_ok=1 + fi + fi + + if [[ "${compile_ok}" -ne 1 ]]; then + append_stage_marker "${test_log}" "nargo compile" "FAIL" + echo "FAIL: nargo compile" + echo "FAIL: nargo compile" >> "${test_log}" + (( failed += 1 )) + rm -rf "${sandbox_root}" + continue + fi + + append_stage_marker "${test_log}" "nargo compile" "PASS" + + compiled_jsons=("${workdir}"/target/*.json) + if [[ 
"${#compiled_jsons[@]}" -eq 0 ]]; then + compiled_jsons=("${sandbox_exec_root}/${fixture_name}"/target/*.json) + fi + if [[ "${#compiled_jsons[@]}" -eq 0 ]]; then + compiled_jsons=("${sandbox_exec_root}/${fixture_name}"/**/target/*.json) + fi + if [[ "${#compiled_jsons[@]}" -eq 0 ]]; then + append_stage_marker "${test_log}" "compile output check" "FAIL" + echo "FAIL: missing compiled target JSON after nargo compile" + echo "FAIL: missing compiled target JSON after nargo compile" >> "${test_log}" + (( failed += 1 )) + rm -rf "${sandbox_root}" + continue + fi + + workdir_package_name="$(read_workdir_package_name "${workdir}")" + circuit_json_abs="" + if [[ -n "${workdir_package_name}" ]]; then + for candidate_json in "${compiled_jsons[@]}"; do + if [[ "$(basename "${candidate_json}" .json)" == "${workdir_package_name}" ]]; then + circuit_json_abs="${candidate_json}" + break + fi + done + fi + if [[ -z "${circuit_json_abs}" ]]; then + circuit_json_abs="${compiled_jsons[0]}" + fi + + circuit_json="$(relative_path "${workdir}" "${circuit_json_abs}")" + package_name="$(basename "${circuit_json_abs}" .json)" + prover_toml_rel="$(resolve_prover_toml "${workdir}" "${package_name}")" + + if [[ -z "${prover_toml_rel}" || ! -f "${workdir}/${prover_toml_rel}" ]]; then + append_stage_marker "${test_log}" "resolve prover.toml" "FAIL" + echo "FAIL: could not locate Prover.toml for compiled package ${package_name}" + echo "FAIL: could not locate Prover.toml for compiled package ${package_name}" >> "${test_log}" + (( failed += 1 )) + rm -rf "${sandbox_root}" + continue + fi + + echo "circuit_json=${circuit_json}" >> "${test_log}" + echo "prover_toml=${prover_toml_rel}" >> "${test_log}" + + append_stage_marker "${test_log}" "provekit-cli prepare" "START" + if ! 
(cd "${workdir}" && "${PROVEKIT_BIN}" prepare "./${circuit_json}" --pkp "./prover.pkp" --pkv "./verifier.pkv" >> "${test_log}" 2>&1); then + append_stage_marker "${test_log}" "provekit-cli prepare" "FAIL" + echo "FAIL: provekit-cli prepare" + echo "FAIL: provekit-cli prepare" >> "${test_log}" + (( failed += 1 )) + rm -rf "${sandbox_root}" + continue + fi + append_stage_marker "${test_log}" "provekit-cli prepare" "PASS" + + append_stage_marker "${test_log}" "provekit-cli prove" "START" + if ! (cd "${workdir}" && "${PROVEKIT_BIN}" prove "./prover.pkp" "./${prover_toml_rel}" -o "./proof.np" >> "${test_log}" 2>&1); then + append_stage_marker "${test_log}" "provekit-cli prove" "FAIL" + echo "FAIL: provekit-cli prove" + echo "FAIL: provekit-cli prove" >> "${test_log}" + (( failed += 1 )) + rm -rf "${sandbox_root}" + continue + fi + append_stage_marker "${test_log}" "provekit-cli prove" "PASS" + + append_stage_marker "${test_log}" "provekit-cli verify" "START" + if ! (cd "${workdir}" && "${PROVEKIT_BIN}" verify "./verifier.pkv" "./proof.np" >> "${test_log}" 2>&1); then + append_stage_marker "${test_log}" "provekit-cli verify" "FAIL" + echo "FAIL: provekit-cli verify" + echo "FAIL: provekit-cli verify" >> "${test_log}" + (( failed += 1 )) + rm -rf "${sandbox_root}" + continue + fi + append_stage_marker "${test_log}" "provekit-cli verify" "PASS" + + echo "PASS" + (( passed += 1 )) + rm -rf "${sandbox_root}" + # Remove per-test log for passing tests to keep artifacts lean + rm -f "${test_log}" +done + +attempted=$((passed + failed + skipped)) + +echo "" +echo "----- execution_success summary -----" +echo "Total discovered : ${#test_dirs[@]}" +if [[ -n "${TEST_FILTER}" ]]; then + echo "Test filter : ${TEST_FILTER}" +fi +if [[ "${MAX_TESTS}" -gt 0 ]]; then + echo "Attempted limit : ${MAX_TESTS}" +else + echo "Attempted limit : all" +fi +echo "Attempted : ${attempted}" +echo "Passed : ${passed}" +echo "Failed : ${failed}" +echo "Skipped : ${skipped} (${#SKIP_TESTS[@]} 
unimplemented-blackbox tests)" +echo "Log directory : ${LOG_DIR}" + +LOG_DIR="${LOG_DIR}" PASSED_COUNT="${passed}" python3 - <<'PY' +from pathlib import Path +import re +from collections import defaultdict +import os + +log_dir = Path(os.environ["LOG_DIR"]) +per_test_dir = log_dir / "per_test" +report_file = log_dir / "grouped_error_report.txt" + +logs = sorted(per_test_dir.glob("*.log")) +# PASS logs are deleted after each successful test run; read the count from the shell instead. +status_counts = {"PASS": int(os.environ.get("PASSED_COUNT", "0")), "FAIL": 0, "SKIP": 0} +grouped = defaultdict(list) +stage_groups = defaultdict(list) + +for fp in logs: + text = fp.read_text(errors="replace") + name = fp.stem + + if "SKIP:" in text: + status_counts["SKIP"] += 1 + skip_reason = re.search(r"SKIP: ([^\n]+)", text) + reason = skip_reason.group(1).strip() if skip_reason else "unknown" + grouped[f"SKIP: {reason}"].append(name) + continue + + status_counts["FAIL"] += 1 + fail_stage_match = re.findall(r"FAIL: ([^\n]+)", text) + stage = fail_stage_match[-1].strip() if fail_stage_match else "unknown stage" + stage_groups[stage].append(name) + + blackbox = re.search(r"not implemented: Other black box function: BLACKBOX::([A-Z0-9_]+)", text) + if blackbox: + grouped[f"Not implemented blackbox: {blackbox.group(1)} ({stage})"].append(name) + continue + + if "Program must have one entry point." 
in text: + grouped[f"Program must have one entry point ({stage})"].append(name) + continue + + panic = re.search(r"panicked at [^\n]*:\n([^\n]+)", text) + if panic: + grouped[f"Panic: {panic.group(1).strip()} ({stage})"].append(name) + continue + + solve = re.search(r"Failed to solve program: '([^']+)'", text) + if solve: + grouped[f"Failed to solve program: {solve.group(1)} ({stage})"].append(name) + continue + + assertion = re.search(r"Failed assertion", text) + if assertion: + grouped[f"Failed assertion ({stage})"].append(name) + continue + + compile_error = re.search(r"^error:\s*([^\n]+)", text, flags=re.M) + if compile_error: + grouped[f"Compile error: {compile_error.group(1).strip()} ({stage})"].append(name) + continue + + compile_bug = re.search(r"^bug:\s*([^\n]+)", text, flags=re.M) + if compile_bug: + grouped[f"Compile bug: {compile_bug.group(1).strip()} ({stage})"].append(name) + continue + + generic_error = re.search(r"^Error:\s*([^\n]+)", text, flags=re.M) + if generic_error: + grouped[f"Error: {generic_error.group(1).strip()} ({stage})"].append(name) + continue + + grouped[f"Unknown failure ({stage})"].append(name) + +with report_file.open("w") as f: + f.write(f"logs={len(logs)}\n") + f.write(f"PASS={status_counts['PASS']}\n") + f.write(f"FAIL={status_counts['FAIL']}\n") + f.write(f"SKIP={status_counts['SKIP']}\n") + f.write("\n[stages]\n") + for stage, tests in sorted(stage_groups.items(), key=lambda kv: (-len(kv[1]), kv[0])): + f.write(f"{stage}\t{len(tests)}\t{', '.join(tests)}") + f.write("\n") + f.write("\n[grouped]\n") + for key, tests in sorted(grouped.items(), key=lambda kv: (-len(kv[1]), kv[0])): + f.write(f"{len(tests)}\t{key}\t{', '.join(tests)}") + f.write("\n") +PY + +# Emit GitHub Step Summary when running inside Actions +# (must be after the Python report generator so grouped_error_report.txt exists) +if [[ -n "${GITHUB_STEP_SUMMARY:-}" ]]; then + { + echo "## Noir execution_success — ${RUN_ID}" + echo "" + echo "| Metric | Count |" + 
echo "|--------|------|" + echo "| Discovered | ${#test_dirs[@]} |" + echo "| Attempted | ${attempted} |" + echo "| ✅ Passed | ${passed} |" + echo "| ❌ Failed | ${failed} |" + echo "| ⏭️ Skipped | ${skipped} (${#SKIP_TESTS[@]} unimplemented blackboxes) |" + if [[ ${failed} -gt 0 ]]; then + echo "" + echo "### Failure groups" + echo '```' + cat "${GROUPED_REPORT_FILE}" 2>/dev/null || echo "(no grouped report)" + echo '```' + fi + } >> "${GITHUB_STEP_SUMMARY}" +fi + +echo "Grouped report : ${GROUPED_REPORT_FILE}" + +if [[ "${failed}" -gt 0 ]]; then + exit 1 +fi + +exit 0 diff --git a/scripts/vendor_noir_execution_success.sh b/scripts/vendor_noir_execution_success.sh new file mode 100755 index 000000000..2fd2dddcc --- /dev/null +++ b/scripts/vendor_noir_execution_success.sh @@ -0,0 +1,41 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" +DEST_EXEC_DIR="${REPO_ROOT}/test-programs/noir/execution_success" +DEST_LIB_DIR="${REPO_ROOT}/test-programs/noir/test_libraries" +NOIR_REF="${NOIR_REF:-master}" + +tmpdir="$(mktemp -d)" +cleanup() { + rm -rf "${tmpdir}" +} +trap cleanup EXIT + +echo "Vendoring noir-lang/noir:test_programs/{execution_success,test_libraries} (ref: ${NOIR_REF})" + +git clone --depth 1 --filter=blob:none --sparse --branch "${NOIR_REF}" \ + "https://github.com/noir-lang/noir.git" "${tmpdir}/noir" +git -C "${tmpdir}/noir" sparse-checkout set \ + "test_programs/execution_success" \ + "test_programs/test_libraries" + +mkdir -p "$(dirname "${DEST_EXEC_DIR}")" +rm -rf "${DEST_EXEC_DIR}" "${DEST_LIB_DIR}" +cp -R "${tmpdir}/noir/test_programs/execution_success" "${DEST_EXEC_DIR}" +cp -R "${tmpdir}/noir/test_programs/test_libraries" "${DEST_LIB_DIR}" + +source_commit="$(git -C "${tmpdir}/noir" rev-parse HEAD)" +generated_at="$(date -u +"%Y-%m-%dT%H:%M:%SZ")" + +cat > "${REPO_ROOT}/test-programs/noir/execution_success.SOURCE" < Date: Thu, 16 Apr 2026 12:33:12 +0530 
Subject: [PATCH 02/16] copilot issues addressed --- .github/workflows/noir-execution-success.yml | 12 +++++++----- scripts/run_noir_execution_success.sh | 16 ++++++++++++++++ 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/.github/workflows/noir-execution-success.yml b/.github/workflows/noir-execution-success.yml index 2209672c2..3e6cd267b 100644 --- a/.github/workflows/noir-execution-success.yml +++ b/.github/workflows/noir-execution-success.yml @@ -5,7 +5,7 @@ on: workflow_dispatch: inputs: noir_ref: - description: "noir-lang/noir git ref (tag / branch / SHA)" + description: "noir-lang/noir release tag (e.g. v1.0.0-beta.19)" required: false default: "v1.0.0-beta.19" @@ -44,6 +44,8 @@ jobs: - name: Fetch noir test programs (sparse checkout) run: | tmpdir=$(mktemp -d) + # Export the parent so the cleanup step can remove it entirely. + echo "NOIR_TMPDIR=${tmpdir}" >> "$GITHUB_ENV" echo "NOIR_REPO_DIR=${tmpdir}/noir" >> "$GITHUB_ENV" git clone \ --depth 1 \ @@ -54,7 +56,7 @@ jobs: git -C "${tmpdir}/noir" sparse-checkout set \ test_programs/execution_success \ test_programs/test_libraries - echo "Cloned noir @ $(git -C ${tmpdir}/noir rev-parse HEAD)" + echo "Cloned noir @ $(git -C "${tmpdir}/noir" rev-parse HEAD)" - name: Run execution_success suite env: @@ -77,7 +79,7 @@ jobs: - name: Cleanup noir clone if: always() run: | - if [[ -n "${NOIR_REPO_DIR:-}" && -d "${NOIR_REPO_DIR}" ]]; then - rm -rf "${NOIR_REPO_DIR}" - echo "Cleaned up ${NOIR_REPO_DIR}" + if [[ -n "${NOIR_TMPDIR:-}" && -d "${NOIR_TMPDIR}" ]]; then + rm -rf "${NOIR_TMPDIR}" + echo "Cleaned up ${NOIR_TMPDIR}" fi diff --git a/scripts/run_noir_execution_success.sh b/scripts/run_noir_execution_success.sh index 461ffbb9a..09b7deca0 100755 --- a/scripts/run_noir_execution_success.sh +++ b/scripts/run_noir_execution_success.sh @@ -115,6 +115,12 @@ if [[ "${nargo_version}" != *"${REQUIRED_NARGO_VERSION}"* ]]; then exit 1 fi +if ! 
python3 -c "import tomllib" 2>/dev/null; then + echo "ERROR: python3.11+ is required (tomllib not found)." + echo "Current: $(python3 --version 2>&1)" + exit 1 +fi + mkdir -p "${LOG_DIR}/per_test" GROUPED_REPORT_FILE="${LOG_DIR}/grouped_error_report.txt" @@ -271,6 +277,15 @@ passed=0 failed=0 skipped=0 +# Clean up the active test sandbox if the script exits unexpectedly (SIGINT, error). +_current_sandbox="" +_cleanup_sandbox() { + if [[ -n "${_current_sandbox:-}" && -d "${_current_sandbox}" ]]; then + rm -rf "${_current_sandbox}" + fi +} +trap _cleanup_sandbox EXIT INT TERM + if [[ ! -d "${TEST_LIB_ROOT}" ]]; then echo "WARNING: missing ${TEST_LIB_ROOT}; path-based dependency tests may fail." echo "Run scripts/vendor_noir_execution_success.sh to vendor test_libraries as well." @@ -328,6 +343,7 @@ for test_name in "${test_dirs[@]}"; do fi sandbox_root="$(mktemp -d)" + _current_sandbox="${sandbox_root}" sandbox_noir_root="${sandbox_root}/test-programs/noir" sandbox_exec_root="${sandbox_noir_root}/execution_success" fixture_name="${test_name%%/*}" From 3541e6c80d9a8ee3f622e91379660221a467afe5 Mon Sep 17 00:00:00 2001 From: x-senpai-x Date: Tue, 21 Apr 2026 11:41:47 +0530 Subject: [PATCH 03/16] added mavros comparison --- scripts/generate_witness_comparison.py | 319 +++++++++++++++++++++++++ scripts/run_noir_execution_success.sh | 23 ++ 2 files changed, 342 insertions(+) create mode 100644 scripts/generate_witness_comparison.py diff --git a/scripts/generate_witness_comparison.py b/scripts/generate_witness_comparison.py new file mode 100644 index 000000000..fdbb6ada0 --- /dev/null +++ b/scripts/generate_witness_comparison.py @@ -0,0 +1,319 @@ +#!/usr/bin/env python3 +"""Generate Mavros vs ProveKit witness count comparison table. 
+ +Usage: python3 generate_witness_comparison.py + +Reads provekit_witness_counts.csv produced by run_noir_execution_success.sh, +joins against Mavros Cols from the live reilabs/mavros STATUS.md (with +hardcoded fallback for any entries absent from the live file), and writes +witness_comparison.md to . +""" + +import csv +import sys +import urllib.request +from pathlib import Path + +# Mavros Cols — extracted from reilabs/mavros STATUS.md (column "Cols"), +# noir/test_programs/execution_success/* rows only. Keys are bare circuit names. +MAVROS_COLS: dict[str, int] = { + "a_1327_concrete_in_generic": 4, + "a_1_mul": 577, + "a_2_div": 610, + "a_3_add": 567, + "a_4_sub": 670, + "a_5_over": 798, + "a_6_array": 5619, + "arithmetic_binary_operations": 654, + "array_dedup_regression": 523, + "array_eq": 161, + "array_neq": 161, + "array_oob_regression_7965": 551, + "array_oob_regression_7975": 6, + "array_rc_regression_7842": 1, + "array_with_refs_from_param": 3, + "as_witness": 3, + "assert": 4, + "assert_statement": 3, + "assign_ex": 3, + "bit_and": 1840, + "bit_not": 523, + "bool_not": 2, + "bool_or": 5, + "break_and_continue": 1, + "brillig_acir_as_brillig": 541, + "brillig_array_ifelse": 8, + "brillig_arrays": 4, + "brillig_block_parameter_liveness": 14119, + "brillig_calls": 541, + "brillig_calls_array": 550, + "brillig_calls_conditionals": 577, + "brillig_conditional": 5, + "brillig_constant_reference_regression": 532, + "brillig_cow": 543, + "brillig_cow_assign": 1, + "brillig_fns_as_values": 593, + "brillig_identity_function": 8, + "brillig_loop_size_regression": 3, + "brillig_nested_arrays": 533, + "brillig_not": 9, + "brillig_recursion": 532, + "brillig_recursive_main": 525, + "brillig_recursive_main_indirect": 525, + "brillig_uninitialized_arrays": 790, + "cast_bool": 5, + "cast_to_i8_regression_7776": 648, + "cast_to_u64_regression_7776": 662, + "cast_to_u8_regression_7776": 648, + "comptime_println": 1, + "comptime_println_fmtstr_with_quoted": 1, + 
"comptime_variable_at_runtime": 1, + "conditional_regression_547": 3, + "conditional_regression_underflow": 517, + "custom_entry": 2, + "databus": 610, + "databus_two_calldata_simple": 619, + "debug_logs": 3, + "diamond_deps_0": 4, + "division_by_max": 528, + "do_not_capture_comptime_locals": 1, + "double_neg_cond_bool_input": 4, + "double_neg_cond_global_var": 809, + "dual_constrained_lambdas": 3, + "field_attribute": 533, + "fmtstr_with_global": 1, + "fold_after_inlined_calls": 539, + "fold_basic": 9, + "fold_basic_nested_call": 7, + "fold_distinct_return": 540, + "generics": 3, + "global_consts": 175, + "global_nested_array_regression_9270": 526, + "global_var_entry_point_used_in_another_entry": 3, + "global_var_func_with_multiple_entry_points": 3, + "global_var_multiple_entry_points_nested": 3, + "inline_never_basic": 5, + "integer_array_indexing": 527, + "lambda_taking_lambda_with_variant": 548, + "last_uses_regression_8935": 524, + "loop": 524, + "loop_break_regression_8319": 1, + "loop_invariant_nested_deep": 2, + "loop_invariant_regression_8586": 2, + "loop_small_break": 2, + "main_return": 3, + "modules": 5, + "modules_more": 5, + "modulus": 864, + "mutate_array_copy": 1, + "negated_jmpif_condition": 5, + "negative_associated_constants": 1, + "nested_array_with_refs": 2, + "nested_array_with_refs_from_param": 3, + "nested_arrays_from_brillig": 19, + "nested_fmtstr": 1, + "no_predicates_basic": 5, + "no_predicates_brillig": 532, + "poseidon_bn254_hash_width_3": 552, + "poseidonsponge_x5_254": 608, + "pred_eq": 5, + "prelude": 1, + "reference_alias_in_array": 1, + "regression_10197": 523, + "regression_10307": 531, + "regression_10466": 1, + "regression_10516": 1, + "regression_10690": 1, + "regression_10917": 3, + "regression_10923": 5, + "regression_2660": 572, + "regression_3051": 1, + "regression_3394": 1, + "regression_3607": 596, + "regression_3889": 5, + "regression_4088": 2, + "regression_4124": 2, + "regression_4202": 550, + "regression_4663": 1, + 
"regression_5435": 3, + "regression_5615": 1, + "regression_6451": 11, + "regression_6674_1": 1, + "regression_6674_2": 1, + "regression_6734": 1, + "regression_6990": 1, + "regression_7143": 532, + "regression_7195": 12, + "regression_7451": 1304, + "regression_7962": 1461, + "regression_8174": 569, + "regression_8212": 524, + "regression_8235": 4, + "regression_8329": 7, + "regression_8519": 677, + "regression_8558": 657, + "regression_8739": 1, + "regression_8761": 2, + "regression_8874": 524, + "regression_8890": 6, + "regression_8926": 532, + "regression_8975": 2, + "regression_9037": 3, + "regression_9047": 521, + "regression_9102": 5, + "regression_9116": 1, + "regression_9160": 3, + "regression_9193": 2, + "regression_9206": 532, + "regression_9243": 1, + "regression_9294": 1, + "regression_9329": 523, + "regression_9546": 529, + "regression_9657": 523, + "regression_9725_1": 1, + "regression_9725_2": 2, + "regression_9907": 3, + "regression_method_cannot_be_found": 1, + "return_twice": 5, + "shift_right_overflow": 517, + "shl_signed_regression_9661": 520, + "signed_bitshift": 1, + "signed_overflow_in_else_regression_8617": 660, + "signed_truncation": 918, + "simple_2d_array": 13, + "simple_add_and_ret_arr": 3, + "simple_array_param": 4, + "simple_bitwise": 1355, + "simple_comparison": 784, + "simple_mut": 3, + "simple_not": 3, + "simple_print": 3, + "simple_program_addition": 3, + "struct": 5, + "struct_array_inputs": 8, + "struct_fields_ordering": 524, + "submodules": 4, + "trait_as_return_type": 523, + "trait_associated_constant": 1, + "trait_impl_base_type": 523, + "traits_in_crates_1": 3, + "traits_in_crates_2": 3, + "tuple_inputs": 845, + "tuples": 657, + "type_aliases": 5, + "unsafe_range_constraint": 527, + "unsigned_to_signed_cast": 918, + "while_loop_break_regression_8521": 541, + "wildcard_type": 7, + "witness_compression": 6, + "workspace_default_member": 3, + "wrapping_operations": 908, + "xor": 1367, +} + + +_MAVROS_STATUS_URL = ( + 
"https://raw.githubusercontent.com/reilabs/mavros/main/STATUS.md" +) +_EXEC_SUCCESS_PREFIX = "noir/test_programs/execution_success/" + + +def _fetch_live_mavros_cols() -> dict[str, int]: + """Parse Mavros Cols from the live reilabs/mavros STATUS.md. + + Returns an empty dict on any network or parse failure so the caller + can fall back gracefully to the hardcoded table. + """ + try: + with urllib.request.urlopen(_MAVROS_STATUS_URL, timeout=15) as resp: + content = resp.read().decode("utf-8") + except Exception as exc: + print( + f"Warning: could not fetch live Mavros STATUS.md ({exc}); " + "falling back to hardcoded data.", + file=sys.stderr, + ) + return {} + + result: dict[str, int] = {} + for line in content.splitlines(): + if _EXEC_SUCCESS_PREFIX not in line: + continue + parts = line.split("|") + # Table row: | test | Compiled | R1CS | Rows | Cols | ... + # After split: parts[1]=test, parts[5]=Cols (1-indexed, 0 is empty) + if len(parts) < 6: + continue + name_field = parts[1].strip() + cols_field = parts[5].strip() + if _EXEC_SUCCESS_PREFIX not in name_field: + continue + circuit = name_field.split(_EXEC_SUCCESS_PREFIX, 1)[1].strip() + if circuit and cols_field.isdigit(): + result[circuit] = int(cols_field) + + if result: + print(f"Fetched {len(result)} Mavros entries from live STATUS.md.") + else: + print("Warning: live STATUS.md parsed 0 entries; using hardcoded data.", file=sys.stderr) + return result + + +def main(csv_path: Path, out_dir: Path) -> None: + # Live data takes precedence; hardcoded fills any gaps. 
+ live = _fetch_live_mavros_cols() + mavros_cols = {**MAVROS_COLS, **live} + + provekit: dict[str, int] = {} + with csv_path.open() as f: + for row in csv.DictReader(f): + leaf = row["test_name"].split("/")[-1] + try: + provekit[leaf] = int(row["provekit_witnesses"]) + except (ValueError, KeyError): + continue + + comparable = [ + (name, mavros, provekit[name]) + for name, mavros in sorted(mavros_cols.items()) + if name in provekit + ] + + equal = sum(1 for _, m, p in comparable if m == p) + mavros_better = sum(1 for _, m, p in comparable if m < p) + provekit_better = sum(1 for _, m, p in comparable if p < m) + + lines = [ + "# Mavros vs Provekit Witnesses Count", + "", + f"Comparable {len(comparable)} circuits: {equal} equal, " + f"{mavros_better} Mavros better, {provekit_better} Provekit better.", + "", + "| Test | Mavros Cols | Provekit Post-GE | Delta | Better | Factor |", + "|------|-------------|------------------|-------|--------|--------|", + ] + + for name, mavros, pk in comparable: + delta = pk - mavros + delta_str = f"+{delta}" if delta > 0 else str(delta) + if mavros == pk: + better = "equal" + factor = "1.00x" + elif pk < mavros: + better = "provekit" + factor = f"{mavros / pk:.2f}x" + else: + better = "mavros" + factor = f"{pk / mavros:.2f}x" + lines.append(f"| {name} | {mavros} | {pk} | {delta_str} | {better} | {factor} |") + + out_path = out_dir / "witness_comparison.md" + out_path.write_text("\n".join(lines) + "\n") + print(f"Wrote {out_path} ({len(comparable)} circuits compared)") + + +if __name__ == "__main__": + if len(sys.argv) != 3: + print(f"Usage: {sys.argv[0]} ", file=sys.stderr) + sys.exit(1) + main(Path(sys.argv[1]), Path(sys.argv[2])) diff --git a/scripts/run_noir_execution_success.sh b/scripts/run_noir_execution_success.sh index 09b7deca0..a58c4ff49 100755 --- a/scripts/run_noir_execution_success.sh +++ b/scripts/run_noir_execution_success.sh @@ -123,6 +123,8 @@ fi mkdir -p "${LOG_DIR}/per_test" 
GROUPED_REPORT_FILE="${LOG_DIR}/grouped_error_report.txt" +WITNESS_CSV="${LOG_DIR}/provekit_witness_counts.csv" +echo "test_name,provekit_witnesses" > "${WITNESS_CSV}" shopt -s nullglob globstar @@ -442,6 +444,13 @@ for test_name in "${test_dirs[@]}"; do fi append_stage_marker "${test_log}" "provekit-cli prepare" "PASS" + # Extract ProveKit post-GE witness count before the log is deleted on success + _ge_line=$(grep -o 'After GE optimization: [0-9]* constraints, [0-9]* witnesses' "${test_log}" | tail -1) + _pk_witnesses=$(echo "${_ge_line}" | grep -o '[0-9]* witnesses' | grep -o '^[0-9]*') + if [[ -n "${_pk_witnesses}" ]]; then + echo "${test_name},${_pk_witnesses}" >> "${WITNESS_CSV}" + fi + append_stage_marker "${test_log}" "provekit-cli prove" "START" if ! (cd "${workdir}" && "${PROVEKIT_BIN}" prove "./prover.pkp" "./${prover_toml_rel}" -o "./proof.np" >> "${test_log}" 2>&1); then append_stage_marker "${test_log}" "provekit-cli prove" "FAIL" @@ -603,6 +612,20 @@ fi echo "Grouped report : ${GROUPED_REPORT_FILE}" +# Generate Mavros vs ProveKit witness comparison table +if [[ -f "${WITNESS_CSV}" ]] && python3 "${SCRIPT_DIR}/generate_witness_comparison.py" "${WITNESS_CSV}" "${LOG_DIR}"; then + echo "Witness comparison: ${LOG_DIR}/witness_comparison.md" + if [[ -n "${GITHUB_STEP_SUMMARY:-}" ]]; then + { + echo "" + echo "## Mavros vs ProveKit Witness Count" + head -4 "${LOG_DIR}/witness_comparison.md" + echo "" + echo "_Full table available in artifact: \`witness_comparison.md\`_" + } >> "${GITHUB_STEP_SUMMARY}" + fi +fi + if [[ "${failed}" -gt 0 ]]; then exit 1 fi From 688d761926a6785c417f3b88a446089f38d7844b Mon Sep 17 00:00:00 2001 From: x-senpai-x Date: Tue, 21 Apr 2026 11:55:46 +0530 Subject: [PATCH 04/16] made v19 defauilt in vendor script --- scripts/run_noir_execution_success.sh | 10 +++++++--- scripts/vendor_noir_execution_success.sh | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/run_noir_execution_success.sh 
b/scripts/run_noir_execution_success.sh index a58c4ff49..71dc4b34d 100755 --- a/scripts/run_noir_execution_success.sh +++ b/scripts/run_noir_execution_success.sh @@ -444,9 +444,13 @@ for test_name in "${test_dirs[@]}"; do fi append_stage_marker "${test_log}" "provekit-cli prepare" "PASS" - # Extract ProveKit post-GE witness count before the log is deleted on success - _ge_line=$(grep -o 'After GE optimization: [0-9]* constraints, [0-9]* witnesses' "${test_log}" | tail -1) - _pk_witnesses=$(echo "${_ge_line}" | grep -o '[0-9]* witnesses' | grep -o '^[0-9]*') + # Extract ProveKit post-GE witness count before the log is deleted on success. + # Keep this non-fatal under `set -euo pipefail` if the log format changes/misses. + _ge_line="$(grep -o 'After GE optimization: [0-9]* constraints, [0-9]* witnesses' "${test_log}" | tail -1 || true)" + _pk_witnesses="" + if [[ "${_ge_line}" =~ ([0-9]+)\ witnesses$ ]]; then + _pk_witnesses="${BASH_REMATCH[1]}" + fi if [[ -n "${_pk_witnesses}" ]]; then echo "${test_name},${_pk_witnesses}" >> "${WITNESS_CSV}" fi diff --git a/scripts/vendor_noir_execution_success.sh b/scripts/vendor_noir_execution_success.sh index 2fd2dddcc..9bdfae962 100755 --- a/scripts/vendor_noir_execution_success.sh +++ b/scripts/vendor_noir_execution_success.sh @@ -5,7 +5,7 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" REPO_ROOT="$(cd "${SCRIPT_DIR}/.." 
&& pwd)" DEST_EXEC_DIR="${REPO_ROOT}/test-programs/noir/execution_success" DEST_LIB_DIR="${REPO_ROOT}/test-programs/noir/test_libraries" -NOIR_REF="${NOIR_REF:-master}" +NOIR_REF="${NOIR_REF:-v1.0.0-beta.19}" tmpdir="$(mktemp -d)" cleanup() { From 4d24b89bc1a741647af5ec7398dbb5c822048035 Mon Sep 17 00:00:00 2001 From: x-senpai-x Date: Tue, 21 Apr 2026 12:55:57 +0530 Subject: [PATCH 05/16] added missing tests in one of them as well --- scripts/generate_witness_comparison.py | 76 ++++++++++++++++++++++---- 1 file changed, 66 insertions(+), 10 deletions(-) diff --git a/scripts/generate_witness_comparison.py b/scripts/generate_witness_comparison.py index fdbb6ada0..e3619d4ad 100644 --- a/scripts/generate_witness_comparison.py +++ b/scripts/generate_witness_comparison.py @@ -217,6 +217,37 @@ ) _EXEC_SUCCESS_PREFIX = "noir/test_programs/execution_success/" +# Keep in sync with SKIP_TESTS in scripts/run_noir_execution_success.sh. +# These are intentionally excluded from both sides of the witness comparison. +SKIP_TESTS: set[str] = { + # BLAKE3 + "a_6", + "array_dynamic_blackbox_input", + "array_dynamic_nested_blackbox_input", + "blake3", + "conditional_1", + "conditional_regression_short_circuit", + "regression_4449", + # ECDSA_SECP256K1 + "bench_ecdsa_secp256k1", + "ecdsa_secp256k1", + "ecdsa_secp256k1_invalid_inputs", + "ecdsa_secp256k1_invalid_pub_key_in_inactive_branch", + # ECDSA_SECP256R1 + "ecdsa_secp256r1", + "ecdsa_secp256r1_3x", + "ecdsa_secp256r1_invalid_pub_key_in_inactive_branch", + "ecdsa_secp256r1_msg_equals_order", + # EMBEDDED_CURVE_ADD + "embedded_curve_ops", + "regression_5045", + "regression_7744", + # AES128_ENCRYPT + "aes128_encrypt", + # BLAKE2S + "a_7", +} + def _fetch_live_mavros_cols() -> dict[str, int]: """Parse Mavros Cols from the live reilabs/mavros STATUS.md. @@ -262,22 +293,31 @@ def _fetch_live_mavros_cols() -> dict[str, int]: def main(csv_path: Path, out_dir: Path) -> None: # Live data takes precedence; hardcoded fills any gaps. 
live = _fetch_live_mavros_cols() - mavros_cols = {**MAVROS_COLS, **live} + mavros_cols = { + name: cols + for name, cols in {**MAVROS_COLS, **live}.items() + if name not in SKIP_TESTS + } provekit: dict[str, int] = {} with csv_path.open() as f: for row in csv.DictReader(f): leaf = row["test_name"].split("/")[-1] + if leaf in SKIP_TESTS: + continue try: provekit[leaf] = int(row["provekit_witnesses"]) except (ValueError, KeyError): continue + all_names = sorted(set(mavros_cols) | set(provekit)) comparable = [ - (name, mavros, provekit[name]) - for name, mavros in sorted(mavros_cols.items()) - if name in provekit + (name, mavros_cols[name], provekit[name]) + for name in all_names + if name in mavros_cols and name in provekit ] + missing_in_provekit = sum(1 for name in all_names if name in mavros_cols and name not in provekit) + missing_in_mavros = sum(1 for name in all_names if name in provekit and name not in mavros_cols) equal = sum(1 for _, m, p in comparable if m == p) mavros_better = sum(1 for _, m, p in comparable if m < p) @@ -286,14 +326,27 @@ def main(csv_path: Path, out_dir: Path) -> None: lines = [ "# Mavros vs Provekit Witnesses Count", "", - f"Comparable {len(comparable)} circuits: {equal} equal, " - f"{mavros_better} Mavros better, {provekit_better} Provekit better.", + f"Union {len(all_names)} circuits: {len(comparable)} comparable, " + f"{missing_in_provekit} missing in Provekit, {missing_in_mavros} missing in Mavros.", + f"Among comparable: {equal} equal, {mavros_better} Mavros better, " + f"{provekit_better} Provekit better.", "", "| Test | Mavros Cols | Provekit Post-GE | Delta | Better | Factor |", "|------|-------------|------------------|-------|--------|--------|", ] - for name, mavros, pk in comparable: + for name in all_names: + mavros = mavros_cols.get(name) + pk = provekit.get(name) + + if mavros is None: + lines.append(f"| {name} | - | {pk} | - | missing_mavros | - |") + continue + + if pk is None: + lines.append(f"| {name} | {mavros} | - | 
- | missing_provekit | - |") + continue + delta = pk - mavros delta_str = f"+{delta}" if delta > 0 else str(delta) if mavros == pk: @@ -301,15 +354,18 @@ def main(csv_path: Path, out_dir: Path) -> None: factor = "1.00x" elif pk < mavros: better = "provekit" - factor = f"{mavros / pk:.2f}x" + factor = "inf" if pk == 0 else f"{mavros / pk:.2f}x" else: better = "mavros" - factor = f"{pk / mavros:.2f}x" + factor = "inf" if mavros == 0 else f"{pk / mavros:.2f}x" lines.append(f"| {name} | {mavros} | {pk} | {delta_str} | {better} | {factor} |") out_path = out_dir / "witness_comparison.md" out_path.write_text("\n".join(lines) + "\n") - print(f"Wrote {out_path} ({len(comparable)} circuits compared)") + print( + f"Wrote {out_path} " + f"({len(all_names)} total circuits, {len(comparable)} comparable)" + ) if __name__ == "__main__": From fe608a9ab533ebb8ed0689bcdb8a1f4188d5e885 Mon Sep 17 00:00:00 2001 From: x-senpai-x Date: Tue, 21 Apr 2026 23:56:32 +0530 Subject: [PATCH 06/16] refactored shell script --- .github/scripts/build_noir_pr_comment.py | 221 ++++++++++++++++ .github/workflows/noir-execution-success.yml | 77 +++++- scripts/generate_witness_comparison.py | 36 +-- scripts/noir_execution_helpers.py | 258 ++++++++++++++++++ scripts/noir_skip_tests.txt | 38 +++ scripts/run_noir_execution_success.sh | 265 ++----------------- 6 files changed, 623 insertions(+), 272 deletions(-) create mode 100644 .github/scripts/build_noir_pr_comment.py create mode 100644 scripts/noir_execution_helpers.py create mode 100644 scripts/noir_skip_tests.txt diff --git a/.github/scripts/build_noir_pr_comment.py b/.github/scripts/build_noir_pr_comment.py new file mode 100644 index 000000000..bd760a354 --- /dev/null +++ b/.github/scripts/build_noir_pr_comment.py @@ -0,0 +1,221 @@ +#!/usr/bin/env python3 +"""Build a sticky PR comment for noir execution_success workflow runs.""" + +from __future__ import annotations + +import argparse +import re +from pathlib import Path + +MARKER = "" 
+MAX_COMMENT_CHARS = 62000 +MIN_SECTION_CHARS = 1500 + + +def read_report(path: Path, display_name: str) -> str: + if not path.is_file(): + return f"(missing: {display_name})" + + text = path.read_text(encoding="utf-8", errors="replace").strip() + if not text: + return f"(empty: {display_name})" + return text + + +def parse_grouped_counts(grouped_report_text: str) -> dict[str, str]: + counts: dict[str, str] = {} + for key in ("PASS", "FAIL", "SKIP"): + match = re.search(rf"^{key}=(\d+)$", grouped_report_text, flags=re.MULTILINE) + counts[key] = match.group(1) if match else "n/a" + return counts + + +def status_with_icon(status: str) -> str: + normalized = (status or "unknown").strip().lower() + labels = { + "success": "[PASS]", + "failure": "[FAIL]", + "cancelled": "[CANCELLED]", + "skipped": "[SKIPPED]", + } + return f"{labels.get(normalized, '[INFO]')} {normalized}" + + +def sanitize_code_fence(text: str) -> str: + return text.replace("```", "``\\`") + + +def compose_comment( + grouped_report_text: str, + witness_report_text: str, + grouped_truncated: bool, + witness_truncated: bool, + run_id: str, + run_url: str, + sha: str, + noir_ref: str, + status: str, +) -> str: + counts = parse_grouped_counts(grouped_report_text) + short_sha = sha[:12] if sha else "unknown" + + grouped_truncated_note = ( + "\n_Grouped report truncated to fit GitHub comment size limits._\n" + if grouped_truncated + else "" + ) + witness_truncated_note = ( + "\n_Witness comparison truncated to fit GitHub comment size limits._\n" + if witness_truncated + else "" + ) + + lines = [ + MARKER, + "## Noir execution_success report", + "", + "| Metric | Value |", + "|--------|-------|", + f"| Workflow status | {status_with_icon(status)} |", + f"| Noir ref | `{noir_ref}` |", + f"| Commit | `{short_sha}` |", + f"| Run | [#{run_id}]({run_url}) |", + f"| PASS | {counts['PASS']} |", + f"| FAIL | {counts['FAIL']} |", + f"| SKIP | {counts['SKIP']} |", + "", + "
", + "grouped_error_report.txt", + "", + "```text", + sanitize_code_fence(grouped_report_text), + "```", + grouped_truncated_note, + "
", + "", + "
", + "witness_comparison.md", + "", + witness_report_text, + witness_truncated_note, + "
", + "", + "_This comment is automatically updated by the Noir Execution Success workflow._", + "", + ] + + return "\n".join(lines) + + +def clip_tail(text: str, min_chars: int, excess: int, label: str) -> tuple[str, bool]: + if len(text) <= min_chars or excess <= 0: + return text, False + + reduction = min(len(text) - min_chars, excess + 1024) + kept = text[: len(text) - reduction].rstrip() + omitted = len(text) - len(kept) + clipped = f"{kept}\n\n[... truncated {omitted} characters from {label} ...]" + return clipped, True + + +def build_with_truncation( + grouped_report_text: str, + witness_report_text: str, + run_id: str, + run_url: str, + sha: str, + noir_ref: str, + status: str, +) -> str: + grouped_work = grouped_report_text + witness_work = witness_report_text + grouped_truncated = False + witness_truncated = False + + for _ in range(128): + comment = compose_comment( + grouped_work, + witness_work, + grouped_truncated=grouped_truncated, + witness_truncated=witness_truncated, + run_id=run_id, + run_url=run_url, + sha=sha, + noir_ref=noir_ref, + status=status, + ) + if len(comment) <= MAX_COMMENT_CHARS: + return comment + + excess = len(comment) - MAX_COMMENT_CHARS + witness_work, witness_changed = clip_tail( + witness_work, MIN_SECTION_CHARS, excess, "witness_comparison.md" + ) + witness_truncated = witness_truncated or witness_changed + if witness_changed: + continue + + grouped_work, grouped_changed = clip_tail( + grouped_work, MIN_SECTION_CHARS, excess, "grouped_error_report.txt" + ) + grouped_truncated = grouped_truncated or grouped_changed + if grouped_changed: + continue + + break + + # Final hard guard if both reports are already near minimum length. 
+ fallback = compose_comment( + grouped_work, + witness_work, + grouped_truncated=True, + witness_truncated=True, + run_id=run_id, + run_url=run_url, + sha=sha, + noir_ref=noir_ref, + status=status, + ) + if len(fallback) <= MAX_COMMENT_CHARS: + return fallback + + hard_cut = fallback[: MAX_COMMENT_CHARS - 120].rstrip() + return f"{hard_cut}\n\n_Comment truncated due to GitHub size limits._\n" + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--grouped-report", required=True, type=Path) + parser.add_argument("--witness-report", required=True, type=Path) + parser.add_argument("--output", required=True, type=Path) + parser.add_argument("--run-id", required=True) + parser.add_argument("--run-url", required=True) + parser.add_argument("--sha", required=True) + parser.add_argument("--noir-ref", required=True) + parser.add_argument("--status", required=True) + return parser.parse_args() + + +def main() -> None: + args = parse_args() + + grouped_report_text = read_report(args.grouped_report, "grouped_error_report.txt") + witness_report_text = read_report(args.witness_report, "witness_comparison.md") + + body = build_with_truncation( + grouped_report_text=grouped_report_text, + witness_report_text=witness_report_text, + run_id=args.run_id, + run_url=args.run_url, + sha=args.sha, + noir_ref=args.noir_ref, + status=args.status, + ) + + args.output.parent.mkdir(parents=True, exist_ok=True) + args.output.write_text(body, encoding="utf-8") + print(f"Wrote PR comment body to {args.output} ({len(body)} chars)") + + +if __name__ == "__main__": + main() diff --git a/.github/workflows/noir-execution-success.yml b/.github/workflows/noir-execution-success.yml index 3e6cd267b..5def5ac96 100644 --- a/.github/workflows/noir-execution-success.yml +++ b/.github/workflows/noir-execution-success.yml @@ -2,6 +2,7 @@ name: Noir Execution Success Tests # Provide a noir_ref to test against any Noir release. 
on: + pull_request: workflow_dispatch: inputs: noir_ref: @@ -9,9 +10,13 @@ on: required: false default: "v1.0.0-beta.19" +permissions: + contents: read + pull-requests: write + env: CARGO_TERM_COLOR: always - NOIR_REF: ${{ inputs.noir_ref || 'v1.0.0-beta.19' }} + NOIR_REF: ${{ github.event_name == 'workflow_dispatch' && (github.event.inputs.noir_ref != '' && github.event.inputs.noir_ref || 'v1.0.0-beta.19') || 'v1.0.0-beta.19' }} # Cancel any in-progress run on the same branch when a new one is triggered. concurrency: @@ -20,7 +25,7 @@ concurrency: jobs: noir-execution-success: - name: Noir execution_success suite (${{ inputs.noir_ref || 'v1.0.0-beta.19' }}) + name: Noir execution_success suite (${{ github.event_name == 'workflow_dispatch' && (github.event.inputs.noir_ref != '' && github.event.inputs.noir_ref || 'v1.0.0-beta.19') || 'v1.0.0-beta.19' }}) runs-on: [self-hosted, Linux, ARM64, provekit-build] steps: @@ -75,6 +80,74 @@ jobs: path: noir-execution-logs/ retention-days: 7 + - name: Build sticky PR comment body + if: always() && github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository + continue-on-error: true + run: | + python3 .github/scripts/build_noir_pr_comment.py \ + --grouped-report "noir-execution-logs/grouped_error_report.txt" \ + --witness-report "noir-execution-logs/witness_comparison.md" \ + --output "noir-execution-logs/pr_comment.md" \ + --run-id "${{ github.run_id }}" \ + --run-url "${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" \ + --sha "${{ github.sha }}" \ + --noir-ref "${{ env.NOIR_REF }}" \ + --status "${{ job.status }}" + + - name: Upsert sticky PR report comment + if: always() && github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository + continue-on-error: true + uses: actions/github-script@v7 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + script: | + const fs = require('fs'); + const marker = 
''; + const bodyPath = 'noir-execution-logs/pr_comment.md'; + const fallbackBody = [ + marker, + '## Noir execution_success report', + '', + 'Unable to generate the detailed report body for this run.', + '', + 'Run: [#${{ github.run_id }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})', + ].join('\n'); + const body = fs.existsSync(bodyPath) ? fs.readFileSync(bodyPath, 'utf8') : fallbackBody; + + const { owner, repo } = context.repo; + const issue_number = context.issue.number; + const comments = await github.paginate(github.rest.issues.listComments, { + owner, + repo, + issue_number, + per_page: 100, + }); + + const existing = comments.find((comment) => + comment.user && + comment.user.login === 'github-actions[bot]' && + comment.body && + comment.body.includes(marker) + ); + + if (existing) { + await github.rest.issues.updateComment({ + owner, + repo, + comment_id: existing.id, + body, + }); + core.info(`Updated existing noir sticky comment (id=${existing.id}).`); + } else { + const created = await github.rest.issues.createComment({ + owner, + repo, + issue_number, + body, + }); + core.info(`Created new noir sticky comment (id=${created.data.id}).`); + } + # Always clean up the temp clone, even if the test step failed. - name: Cleanup noir clone if: always() diff --git a/scripts/generate_witness_comparison.py b/scripts/generate_witness_comparison.py index e3619d4ad..0eda181e8 100644 --- a/scripts/generate_witness_comparison.py +++ b/scripts/generate_witness_comparison.py @@ -14,6 +14,8 @@ import urllib.request from pathlib import Path +from noir_execution_helpers import load_skip_tests + # Mavros Cols — extracted from reilabs/mavros STATUS.md (column "Cols"), # noir/test_programs/execution_success/* rows only. Keys are bare circuit names. 
MAVROS_COLS: dict[str, int] = { @@ -217,36 +219,10 @@ ) _EXEC_SUCCESS_PREFIX = "noir/test_programs/execution_success/" -# Keep in sync with SKIP_TESTS in scripts/run_noir_execution_success.sh. -# These are intentionally excluded from both sides of the witness comparison. -SKIP_TESTS: set[str] = { - # BLAKE3 - "a_6", - "array_dynamic_blackbox_input", - "array_dynamic_nested_blackbox_input", - "blake3", - "conditional_1", - "conditional_regression_short_circuit", - "regression_4449", - # ECDSA_SECP256K1 - "bench_ecdsa_secp256k1", - "ecdsa_secp256k1", - "ecdsa_secp256k1_invalid_inputs", - "ecdsa_secp256k1_invalid_pub_key_in_inactive_branch", - # ECDSA_SECP256R1 - "ecdsa_secp256r1", - "ecdsa_secp256r1_3x", - "ecdsa_secp256r1_invalid_pub_key_in_inactive_branch", - "ecdsa_secp256r1_msg_equals_order", - # EMBEDDED_CURVE_ADD - "embedded_curve_ops", - "regression_5045", - "regression_7744", - # AES128_ENCRYPT - "aes128_encrypt", - # BLAKE2S - "a_7", -} +# Skip list is shared with scripts/run_noir_execution_success.sh via +# scripts/noir_skip_tests.txt; these circuits are excluded from both sides +# of the witness comparison. +SKIP_TESTS: set[str] = load_skip_tests() def _fetch_live_mavros_cols() -> dict[str, int]: diff --git a/scripts/noir_execution_helpers.py b/scripts/noir_execution_helpers.py new file mode 100644 index 000000000..c18288a88 --- /dev/null +++ b/scripts/noir_execution_helpers.py @@ -0,0 +1,258 @@ +#!/usr/bin/env python3 +"""Helpers for scripts/run_noir_execution_success.sh. + +Subcommands: + discover — list runnable test dirs + resolve-prover-toml — find Prover.toml for a package + package-name — read [package].name from Nargo.toml + build-report — write grouped_error_report.txt + skip-tests — print the skip list (one per line) + +The skip list lives in scripts/noir_skip_tests.txt and is the single source +of truth shared with scripts/generate_witness_comparison.py. 
+""" + +from __future__ import annotations + +import argparse +import os +import re +import sys +import tomllib +from collections import defaultdict +from pathlib import Path + +SKIP_LIST_FILE = Path(__file__).with_name("noir_skip_tests.txt") + + +def load_skip_tests() -> set[str]: + """Return the skip list parsed from noir_skip_tests.txt. + + Blank lines and lines starting with `#` are ignored. Inline `#` comments + are stripped. Returns an empty set if the file is missing. + """ + if not SKIP_LIST_FILE.is_file(): + return set() + names: set[str] = set() + for raw in SKIP_LIST_FILE.read_text().splitlines(): + line = raw.split("#", 1)[0].strip() + if line: + names.add(line) + return names + + +def discover_tests(root: Path) -> list[str]: + """Return candidate test project paths relative to ``root``. + + Mirrors the legacy shell heredoc: a path is a candidate if it is a + workspace default-member, or if it has both a `[package]` entry in its + Nargo.toml and a sibling Prover.toml. Nested projects under a workspace + default-member are suppressed. + """ + nargo_data: dict[str, dict] = {} + for nargo in root.rglob("Nargo.toml"): + rel = nargo.parent.relative_to(root).as_posix() + try: + data = tomllib.loads(nargo.read_text()) + except Exception: + data = {} + nargo_data[rel] = data + + workspace_default_roots: set[str] = set() + for rel, data in nargo_data.items(): + ws = data.get("workspace") + if isinstance(ws, dict) and "default-member" in ws: + workspace_default_roots.add(rel) + + suppressed: set[str] = set() + for ws_rel in workspace_default_roots: + ws_path = Path(ws_rel) if ws_rel != "." else Path() + for rel in nargo_data: + rel_path = Path(rel) if rel != "." 
else Path() + if rel_path != ws_path and ws_path in rel_path.parents: + suppressed.add(rel) + + candidates: set[str] = set(workspace_default_roots) + for rel, data in nargo_data.items(): + if rel in suppressed: + continue + pkg = data.get("package") + if isinstance(pkg, dict) and "name" in pkg: + if (root / rel / "Prover.toml").is_file(): + candidates.add(rel) + + return sorted(candidates) + + +def resolve_prover_toml(project_dir: Path, package_name: str) -> str: + """Return Prover.toml path (relative to ``project_dir``) for ``package_name``. + + Prefers a Prover.toml located next to the Nargo.toml whose package name + matches. Falls back to a root-level Prover.toml, then to the sole + Prover.toml under the project when unambiguous. Returns "" otherwise. + """ + matches: list[str] = [] + for nargo in sorted(project_dir.rglob("Nargo.toml")): + try: + data = tomllib.loads(nargo.read_text()) + except Exception: + continue + pkg = data.get("package") + if not isinstance(pkg, dict) or pkg.get("name") != package_name: + continue + prover = nargo.parent / "Prover.toml" + if prover.is_file(): + matches.append(prover.relative_to(project_dir).as_posix()) + + if matches: + matches.sort(key=lambda p: (p.count("/"), p)) + return matches[0] + + root_prover = project_dir / "Prover.toml" + if root_prover.is_file(): + return "Prover.toml" + + all_provers = sorted(project_dir.rglob("Prover.toml")) + if len(all_provers) == 1: + return all_provers[0].relative_to(project_dir).as_posix() + + return "" + + +def read_package_name(project_dir: Path) -> str: + """Return [package].name from ``project_dir/Nargo.toml`` or "".""" + nargo = project_dir / "Nargo.toml" + if not nargo.is_file(): + return "" + try: + data = tomllib.loads(nargo.read_text()) + except Exception: + return "" + pkg = data.get("package") + if isinstance(pkg, dict): + return str(pkg.get("name", "")) + return "" + + +_BLACKBOX_RE = re.compile( + r"not implemented: Other black box function: BLACKBOX::([A-Z0-9_]+)" +) 
+_PANIC_RE = re.compile(r"panicked at [^\n]*:\n([^\n]+)") +_SOLVE_RE = re.compile(r"Failed to solve program: '([^']+)'") +_COMPILE_ERR_RE = re.compile(r"^error:\s*([^\n]+)", flags=re.M) +_COMPILE_BUG_RE = re.compile(r"^bug:\s*([^\n]+)", flags=re.M) +_GENERIC_ERR_RE = re.compile(r"^Error:\s*([^\n]+)", flags=re.M) +_FAIL_STAGE_RE = re.compile(r"FAIL: ([^\n]+)") +_SKIP_REASON_RE = re.compile(r"SKIP: ([^\n]+)") + + +def _classify_failure(text: str, stage: str) -> str: + blackbox = _BLACKBOX_RE.search(text) + if blackbox: + return f"Not implemented blackbox: {blackbox.group(1)} ({stage})" + if "Program must have one entry point." in text: + return f"Program must have one entry point ({stage})" + panic = _PANIC_RE.search(text) + if panic: + return f"Panic: {panic.group(1).strip()} ({stage})" + solve = _SOLVE_RE.search(text) + if solve: + return f"Failed to solve program: {solve.group(1)} ({stage})" + if "Failed assertion" in text: + return f"Failed assertion ({stage})" + compile_error = _COMPILE_ERR_RE.search(text) + if compile_error: + return f"Compile error: {compile_error.group(1).strip()} ({stage})" + compile_bug = _COMPILE_BUG_RE.search(text) + if compile_bug: + return f"Compile bug: {compile_bug.group(1).strip()} ({stage})" + generic = _GENERIC_ERR_RE.search(text) + if generic: + return f"Error: {generic.group(1).strip()} ({stage})" + return f"Unknown failure ({stage})" + + +def build_grouped_report(log_dir: Path, passed_count: int) -> None: + """Scan ``log_dir/per_test/*.log`` and write ``log_dir/grouped_error_report.txt``. + + PASS logs are deleted by the shell runner after each successful test, so + the PASS count is threaded in as ``passed_count`` rather than inferred. 
+ """ + per_test_dir = log_dir / "per_test" + report_file = log_dir / "grouped_error_report.txt" + + logs = sorted(per_test_dir.glob("*.log")) + status_counts = {"PASS": passed_count, "FAIL": 0, "SKIP": 0} + grouped: dict[str, list[str]] = defaultdict(list) + stage_groups: dict[str, list[str]] = defaultdict(list) + + for fp in logs: + text = fp.read_text(errors="replace") + name = fp.stem + + if "SKIP:" in text: + status_counts["SKIP"] += 1 + skip_match = _SKIP_REASON_RE.search(text) + reason = skip_match.group(1).strip() if skip_match else "unknown" + grouped[f"SKIP: {reason}"].append(name) + continue + + status_counts["FAIL"] += 1 + fail_stages = _FAIL_STAGE_RE.findall(text) + stage = fail_stages[-1].strip() if fail_stages else "unknown stage" + stage_groups[stage].append(name) + grouped[_classify_failure(text, stage)].append(name) + + with report_file.open("w") as f: + f.write(f"logs={len(logs)}\n") + f.write(f"PASS={status_counts['PASS']}\n") + f.write(f"FAIL={status_counts['FAIL']}\n") + f.write(f"SKIP={status_counts['SKIP']}\n") + f.write("\n[stages]\n") + for stage, tests in sorted(stage_groups.items(), key=lambda kv: (-len(kv[1]), kv[0])): + f.write(f"{stage}\t{len(tests)}\t{', '.join(tests)}\n") + f.write("\n[grouped]\n") + for key, tests in sorted(grouped.items(), key=lambda kv: (-len(kv[1]), kv[0])): + f.write(f"{len(tests)}\t{key}\t{', '.join(tests)}\n") + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + sub = parser.add_subparsers(dest="cmd", required=True) + + p = sub.add_parser("discover", help="list runnable test dirs under ") + p.add_argument("test_root", type=Path) + + p = sub.add_parser("resolve-prover-toml") + p.add_argument("project_dir", type=Path) + p.add_argument("package_name") + + p = sub.add_parser("package-name") + p.add_argument("project_dir", type=Path) + + p = sub.add_parser("build-report") + p.add_argument("log_dir", type=Path) + p.add_argument("passed_count", type=int) + + sub.add_parser("skip-tests", 
help="print the skip list, one name per line") + + args = parser.parse_args() + + if args.cmd == "discover": + for name in discover_tests(args.test_root): + print(name) + elif args.cmd == "resolve-prover-toml": + print(resolve_prover_toml(args.project_dir, args.package_name)) + elif args.cmd == "package-name": + print(read_package_name(args.project_dir)) + elif args.cmd == "build-report": + build_grouped_report(args.log_dir, args.passed_count) + elif args.cmd == "skip-tests": + for name in sorted(load_skip_tests()): + print(name) + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/noir_skip_tests.txt b/scripts/noir_skip_tests.txt new file mode 100644 index 000000000..31b26c337 --- /dev/null +++ b/scripts/noir_skip_tests.txt @@ -0,0 +1,38 @@ +# Tests that use blackbox functions not yet supported by provekit. +# Counted as SKIP (not FAIL) by scripts/run_noir_execution_success.sh +# and excluded from scripts/generate_witness_comparison.py. +# Remove entries here once the corresponding blackbox is supported. +# +# Format: one bare test name per line. Blank lines and `#` comments are ignored. 
+ +# BLAKE3 +a_6 +array_dynamic_blackbox_input +array_dynamic_nested_blackbox_input +blake3 +conditional_1 +conditional_regression_short_circuit +regression_4449 + +# ECDSA_SECP256K1 +bench_ecdsa_secp256k1 +ecdsa_secp256k1 +ecdsa_secp256k1_invalid_inputs +ecdsa_secp256k1_invalid_pub_key_in_inactive_branch + +# ECDSA_SECP256R1 +ecdsa_secp256r1 +ecdsa_secp256r1_3x +ecdsa_secp256r1_invalid_pub_key_in_inactive_branch +ecdsa_secp256r1_msg_equals_order + +# EMBEDDED_CURVE_ADD +embedded_curve_ops +regression_5045 +regression_7744 + +# AES128_ENCRYPT +aes128_encrypt + +# BLAKE2S +a_7 diff --git a/scripts/run_noir_execution_success.sh b/scripts/run_noir_execution_success.sh index 71dc4b34d..930d606fb 100755 --- a/scripts/run_noir_execution_success.sh +++ b/scripts/run_noir_execution_success.sh @@ -20,6 +20,8 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" +HELPER="${SCRIPT_DIR}/noir_execution_helpers.py" +SKIP_LIST_FILE="${SCRIPT_DIR}/noir_skip_tests.txt" # --------------------------------------------------------------------------- # Resolve test corpus root (CI clone vs. local vendored copy) @@ -47,43 +49,24 @@ fi # --------------------------------------------------------------------------- # Unimplemented-blackbox skip list -# These tests use blackbox functions not yet supported by provekit. -# They are counted as SKIP (not FAIL) and will be added back once supported. +# Single source of truth: scripts/noir_skip_tests.txt (shared with +# scripts/generate_witness_comparison.py). Counted as SKIP (not FAIL). 
# --------------------------------------------------------------------------- -SKIP_TESTS=( - # BLAKE3 - a_6 - array_dynamic_blackbox_input - array_dynamic_nested_blackbox_input - blake3 - conditional_1 - conditional_regression_short_circuit - regression_4449 - # ECDSA_SECP256K1 - bench_ecdsa_secp256k1 - ecdsa_secp256k1 - ecdsa_secp256k1_invalid_inputs - ecdsa_secp256k1_invalid_pub_key_in_inactive_branch - # ECDSA_SECP256R1 - ecdsa_secp256r1 - ecdsa_secp256r1_3x - ecdsa_secp256r1_invalid_pub_key_in_inactive_branch - ecdsa_secp256r1_msg_equals_order - # EMBEDDED_CURVE_ADD - embedded_curve_ops - regression_5045 - regression_7744 - # AES128_ENCRYPT - aes128_encrypt - # BLAKE2S - a_7 -) - -# Build a fast associative-array lookup +SKIP_TESTS=() declare -A SKIP_SET -for _t in "${SKIP_TESTS[@]}"; do - SKIP_SET["${_t}"]=1 -done +if [[ -f "${SKIP_LIST_FILE}" ]]; then + while IFS= read -r _raw || [[ -n "${_raw}" ]]; do + _name="${_raw%%#*}" + _name="${_name#"${_name%%[![:space:]]*}"}" + _name="${_name%"${_name##*[![:space:]]}"}" + if [[ -n "${_name}" ]]; then + SKIP_TESTS+=("${_name}") + SKIP_SET["${_name}"]=1 + fi + done < "${SKIP_LIST_FILE}" +else + echo "WARNING: skip list ${SKIP_LIST_FILE} not found; no tests will be skipped." >&2 +fi if [[ ! -d "${TEST_ROOT}" ]]; then echo "ERROR: Missing test corpus at ${TEST_ROOT}" @@ -128,134 +111,22 @@ echo "test_name,provekit_witnesses" > "${WITNESS_CSV}" shopt -s nullglob globstar +# Python helpers live in scripts/noir_execution_helpers.py; these are thin +# shell wrappers so the main loop reads naturally. 
discover_test_dirs() { - TEST_ROOT="${TEST_ROOT}" python3 - <<'PY' -from pathlib import Path -import tomllib -import os - -root = Path(os.environ["TEST_ROOT"]) -nargo_data = {} - -for nargo in root.rglob("Nargo.toml"): - rel = nargo.parent.relative_to(root).as_posix() - try: - data = tomllib.loads(nargo.read_text()) - except Exception: - data = {} - nargo_data[rel] = data - -workspace_default_roots = set() -for rel, data in nargo_data.items(): - ws = data.get("workspace") - if isinstance(ws, dict) and "default-member" in ws: - workspace_default_roots.add(rel) - -suppressed = set() -for ws_rel in workspace_default_roots: - ws_path = Path(ws_rel) if ws_rel != "." else Path() - for rel in nargo_data: - rel_path = Path(rel) if rel != "." else Path() - if rel_path != ws_path and ws_path in rel_path.parents: - suppressed.add(rel) - -candidates = set(workspace_default_roots) -for rel, data in nargo_data.items(): - if rel in suppressed: - continue - - pkg = data.get("package") - if isinstance(pkg, dict) and "name" in pkg: - if (root / rel / "Prover.toml").is_file(): - candidates.add(rel) - -for rel in sorted(candidates): - print(rel) -PY + python3 "${HELPER}" discover "${TEST_ROOT}" } resolve_prover_toml() { - local project_dir="$1" - local package_name="$2" - - PROJECT_DIR="${project_dir}" PACKAGE_NAME="${package_name}" python3 - <<'PY' -from pathlib import Path -import tomllib -import os - -project_dir = Path(os.environ["PROJECT_DIR"]) -package_name = os.environ["PACKAGE_NAME"] - -candidates = [] -for nargo in sorted(project_dir.rglob("Nargo.toml")): - try: - data = tomllib.loads(nargo.read_text()) - except Exception: - continue - - pkg = data.get("package") - if not isinstance(pkg, dict): - continue - - if pkg.get("name") != package_name: - continue - - prover = nargo.parent / "Prover.toml" - if prover.is_file(): - candidates.append(prover.relative_to(project_dir).as_posix()) - -if candidates: - candidates.sort(key=lambda p: (p.count("/"), p)) - print(candidates[0]) - 
raise SystemExit(0) - -root_prover = project_dir / "Prover.toml" -if root_prover.is_file(): - print("Prover.toml") - raise SystemExit(0) - -all_provers = sorted(project_dir.rglob("Prover.toml")) -if len(all_provers) == 1: - print(all_provers[0].relative_to(project_dir).as_posix()) - raise SystemExit(0) - -print("") -PY + python3 "${HELPER}" resolve-prover-toml "$1" "$2" } read_workdir_package_name() { - local project_dir="$1" - PROJECT_DIR="${project_dir}" python3 - <<'PY' -from pathlib import Path -import tomllib -import os - -nargo = Path(os.environ["PROJECT_DIR"]) / "Nargo.toml" -if not nargo.is_file(): - print("") - raise SystemExit(0) - -try: - data = tomllib.loads(nargo.read_text()) -except Exception: - print("") - raise SystemExit(0) - -pkg = data.get("package") -if isinstance(pkg, dict): - print(pkg.get("name", "")) -else: - print("") -PY + python3 "${HELPER}" package-name "$1" } relative_path() { - local from_dir="$1" - local to_path="$2" - FROM_DIR="${from_dir}" TO_PATH="${to_path}" python3 - <<'PY' -import os -print(os.path.relpath(os.environ["TO_PATH"], os.environ["FROM_DIR"])) -PY + python3 -c 'import os, sys; print(os.path.relpath(sys.argv[2], sys.argv[1]))' "$1" "$2" } @@ -503,93 +374,7 @@ echo "Failed : ${failed}" echo "Skipped : ${skipped} (${#SKIP_TESTS[@]} unimplemented-blackbox tests)" echo "Log directory : ${LOG_DIR}" -LOG_DIR="${LOG_DIR}" PASSED_COUNT="${passed}" python3 - <<'PY' -from pathlib import Path -import re -from collections import defaultdict -import os - -log_dir = Path(os.environ["LOG_DIR"]) -per_test_dir = log_dir / "per_test" -report_file = log_dir / "grouped_error_report.txt" - -logs = sorted(per_test_dir.glob("*.log")) -# PASS logs are deleted after each successful test run; read the count from the shell instead. 
-status_counts = {"PASS": int(os.environ.get("PASSED_COUNT", "0")), "FAIL": 0, "SKIP": 0} -grouped = defaultdict(list) -stage_groups = defaultdict(list) - -for fp in logs: - text = fp.read_text(errors="replace") - name = fp.stem - - if "SKIP:" in text: - status_counts["SKIP"] += 1 - skip_reason = re.search(r"SKIP: ([^\n]+)", text) - reason = skip_reason.group(1).strip() if skip_reason else "unknown" - grouped[f"SKIP: {reason}"].append(name) - continue - - status_counts["FAIL"] += 1 - fail_stage_match = re.findall(r"FAIL: ([^\n]+)", text) - stage = fail_stage_match[-1].strip() if fail_stage_match else "unknown stage" - stage_groups[stage].append(name) - - blackbox = re.search(r"not implemented: Other black box function: BLACKBOX::([A-Z0-9_]+)", text) - if blackbox: - grouped[f"Not implemented blackbox: {blackbox.group(1)} ({stage})"].append(name) - continue - - if "Program must have one entry point." in text: - grouped[f"Program must have one entry point ({stage})"].append(name) - continue - - panic = re.search(r"panicked at [^\n]*:\n([^\n]+)", text) - if panic: - grouped[f"Panic: {panic.group(1).strip()} ({stage})"].append(name) - continue - - solve = re.search(r"Failed to solve program: '([^']+)'", text) - if solve: - grouped[f"Failed to solve program: {solve.group(1)} ({stage})"].append(name) - continue - - assertion = re.search(r"Failed assertion", text) - if assertion: - grouped[f"Failed assertion ({stage})"].append(name) - continue - - compile_error = re.search(r"^error:\s*([^\n]+)", text, flags=re.M) - if compile_error: - grouped[f"Compile error: {compile_error.group(1).strip()} ({stage})"].append(name) - continue - - compile_bug = re.search(r"^bug:\s*([^\n]+)", text, flags=re.M) - if compile_bug: - grouped[f"Compile bug: {compile_bug.group(1).strip()} ({stage})"].append(name) - continue - - generic_error = re.search(r"^Error:\s*([^\n]+)", text, flags=re.M) - if generic_error: - grouped[f"Error: {generic_error.group(1).strip()} ({stage})"].append(name) - 
continue - - grouped[f"Unknown failure ({stage})"].append(name) - -with report_file.open("w") as f: - f.write(f"logs={len(logs)}\n") - f.write(f"PASS={status_counts['PASS']}\n") - f.write(f"FAIL={status_counts['FAIL']}\n") - f.write(f"SKIP={status_counts['SKIP']}\n") - f.write("\n[stages]\n") - for stage, tests in sorted(stage_groups.items(), key=lambda kv: (-len(kv[1]), kv[0])): - f.write(f"{stage}\t{len(tests)}\t{', '.join(tests)}") - f.write("\n") - f.write("\n[grouped]\n") - for key, tests in sorted(grouped.items(), key=lambda kv: (-len(kv[1]), kv[0])): - f.write(f"{len(tests)}\t{key}\t{', '.join(tests)}") - f.write("\n") -PY +python3 "${HELPER}" build-report "${LOG_DIR}" "${passed}" # Emit GitHub Step Summary when running inside Actions # (must be after the Python report generator so grouped_error_report.txt exists) From 65fd51a291f2446eb6e84eec97bcd4efe1091f06 Mon Sep 17 00:00:00 2001 From: x-senpai-x Date: Wed, 22 Apr 2026 00:34:00 +0530 Subject: [PATCH 07/16] minor nits --- .github/workflows/noir-execution-success.yml | 3 +++ scripts/generate_witness_comparison.py | 8 +++---- scripts/noir_execution_helpers.py | 25 ++++++++++++-------- scripts/run_noir_execution_success.sh | 23 ++++++++++-------- 4 files changed, 35 insertions(+), 24 deletions(-) diff --git a/.github/workflows/noir-execution-success.yml b/.github/workflows/noir-execution-success.yml index 5def5ac96..999247592 100644 --- a/.github/workflows/noir-execution-success.yml +++ b/.github/workflows/noir-execution-success.yml @@ -13,6 +13,7 @@ on: permissions: contents: read pull-requests: write + issues: write env: CARGO_TERM_COLOR: always @@ -25,6 +26,8 @@ concurrency: jobs: noir-execution-success: + # Block fork PRs from executing arbitrary build scripts on the self-hosted runner. 
+ if: ${{ github.event_name == 'workflow_dispatch' || github.event.pull_request.head.repo.full_name == github.repository }} name: Noir execution_success suite (${{ github.event_name == 'workflow_dispatch' && (github.event.inputs.noir_ref != '' && github.event.inputs.noir_ref || 'v1.0.0-beta.19') || 'v1.0.0-beta.19' }}) runs-on: [self-hosted, Linux, ARM64, provekit-build] diff --git a/scripts/generate_witness_comparison.py b/scripts/generate_witness_comparison.py index 0eda181e8..75f16dfc9 100644 --- a/scripts/generate_witness_comparison.py +++ b/scripts/generate_witness_comparison.py @@ -300,14 +300,14 @@ def main(csv_path: Path, out_dir: Path) -> None: provekit_better = sum(1 for _, m, p in comparable if p < m) lines = [ - "# Mavros vs Provekit Witnesses Count", + "# Mavros vs ProveKit Witnesses Count", "", f"Union {len(all_names)} circuits: {len(comparable)} comparable, " - f"{missing_in_provekit} missing in Provekit, {missing_in_mavros} missing in Mavros.", + f"{missing_in_provekit} missing in ProveKit, {missing_in_mavros} missing in Mavros.", f"Among comparable: {equal} equal, {mavros_better} Mavros better, " - f"{provekit_better} Provekit better.", + f"{provekit_better} ProveKit better.", "", - "| Test | Mavros Cols | Provekit Post-GE | Delta | Better | Factor |", + "| Test | Mavros Cols | ProveKit Post-GE | Delta | Better | Factor |", "|------|-------------|------------------|-------|--------|--------|", ] diff --git a/scripts/noir_execution_helpers.py b/scripts/noir_execution_helpers.py index c18288a88..080676245 100644 --- a/scripts/noir_execution_helpers.py +++ b/scripts/noir_execution_helpers.py @@ -172,17 +172,17 @@ def _classify_failure(text: str, stage: str) -> str: return f"Unknown failure ({stage})" -def build_grouped_report(log_dir: Path, passed_count: int) -> None: +def build_grouped_report(log_dir: Path, passed: int, failed: int, skipped: int) -> None: """Scan ``log_dir/per_test/*.log`` and write ``log_dir/grouped_error_report.txt``. 
- PASS logs are deleted by the shell runner after each successful test, so - the PASS count is threaded in as ``passed_count`` rather than inferred. + PASS/FAIL/SKIP totals come from the shell runner — it has the authoritative + counts (including blackbox skips, which don't produce per-test logs). Logs + are consulted only for the ``[stages]`` and ``[grouped]`` sections. """ per_test_dir = log_dir / "per_test" report_file = log_dir / "grouped_error_report.txt" logs = sorted(per_test_dir.glob("*.log")) - status_counts = {"PASS": passed_count, "FAIL": 0, "SKIP": 0} grouped: dict[str, list[str]] = defaultdict(list) stage_groups: dict[str, list[str]] = defaultdict(list) @@ -191,13 +191,11 @@ def build_grouped_report(log_dir: Path, passed_count: int) -> None: name = fp.stem if "SKIP:" in text: - status_counts["SKIP"] += 1 skip_match = _SKIP_REASON_RE.search(text) reason = skip_match.group(1).strip() if skip_match else "unknown" grouped[f"SKIP: {reason}"].append(name) continue - status_counts["FAIL"] += 1 fail_stages = _FAIL_STAGE_RE.findall(text) stage = fail_stages[-1].strip() if fail_stages else "unknown stage" stage_groups[stage].append(name) @@ -205,9 +203,9 @@ def build_grouped_report(log_dir: Path, passed_count: int) -> None: with report_file.open("w") as f: f.write(f"logs={len(logs)}\n") - f.write(f"PASS={status_counts['PASS']}\n") - f.write(f"FAIL={status_counts['FAIL']}\n") - f.write(f"SKIP={status_counts['SKIP']}\n") + f.write(f"PASS={passed}\n") + f.write(f"FAIL={failed}\n") + f.write(f"SKIP={skipped}\n") f.write("\n[stages]\n") for stage, tests in sorted(stage_groups.items(), key=lambda kv: (-len(kv[1]), kv[0])): f.write(f"{stage}\t{len(tests)}\t{', '.join(tests)}\n") @@ -233,6 +231,8 @@ def main() -> int: p = sub.add_parser("build-report") p.add_argument("log_dir", type=Path) p.add_argument("passed_count", type=int) + p.add_argument("failed_count", type=int) + p.add_argument("skipped_count", type=int) sub.add_parser("skip-tests", help="print the skip list, 
one name per line") @@ -246,7 +246,12 @@ def main() -> int: elif args.cmd == "package-name": print(read_package_name(args.project_dir)) elif args.cmd == "build-report": - build_grouped_report(args.log_dir, args.passed_count) + build_grouped_report( + args.log_dir, + args.passed_count, + args.failed_count, + args.skipped_count, + ) elif args.cmd == "skip-tests": for name in sorted(load_skip_tests()): print(name) diff --git a/scripts/run_noir_execution_success.sh b/scripts/run_noir_execution_success.sh index 930d606fb..47baad3cd 100755 --- a/scripts/run_noir_execution_success.sh +++ b/scripts/run_noir_execution_success.sh @@ -169,23 +169,26 @@ for test_name in "${test_dirs[@]}"; do continue fi - (( total += 1 )) - - if [[ "${MAX_TESTS}" -gt 0 && "${total}" -gt "${MAX_TESTS}" ]]; then - break - fi - # leaf name (no sub-path) is what we key on in the skip set leaf_name="${test_name%%/*}" - test_dir="${TEST_ROOT}/${test_name}" - safe_test_name="${test_name//\//__}" + # --- Unimplemented blackbox skip list: no log, no noise --- + # Skip BEFORE incrementing `total` so MAX_TESTS caps only attempted tests. if [[ "${SKIP_SET["${leaf_name}"]:-}" == "1" ]]; then echo "SKIP (blackbox): ${test_name}" (( skipped += 1 )) continue fi + (( total += 1 )) + + if [[ "${MAX_TESTS}" -gt 0 && "${total}" -gt "${MAX_TESTS}" ]]; then + break + fi + + test_dir="${TEST_ROOT}/${test_name}" + safe_test_name="${test_name//\//__}" + test_log="${LOG_DIR}/per_test/${safe_test_name}.log" echo "" @@ -207,7 +210,7 @@ for test_name in "${test_dirs[@]}"; do continue fi - if [[ ! -d "${TEST_LIB_ROOT}" ]] && grep -qr 'test_libraries' "${test_dir}"/Nargo.toml 2>/dev/null; then + if [[ ! 
-d "${TEST_LIB_ROOT}" ]] && grep -q 'test_libraries' "${test_dir}"/Nargo.toml 2>/dev/null; then echo "SKIP: missing test_libraries for relative path dependency" append_stage_marker "${test_log}" "test" "SKIP" echo "SKIP: missing test_libraries for relative path dependency" >> "${test_log}" @@ -374,7 +377,7 @@ echo "Failed : ${failed}" echo "Skipped : ${skipped} (${#SKIP_TESTS[@]} unimplemented-blackbox tests)" echo "Log directory : ${LOG_DIR}" -python3 "${HELPER}" build-report "${LOG_DIR}" "${passed}" +python3 "${HELPER}" build-report "${LOG_DIR}" "${passed}" "${failed}" "${skipped}" # Emit GitHub Step Summary when running inside Actions # (must be after the Python report generator so grouped_error_report.txt exists) From 8e011067406449ffb8eb0d050ff3ff879679a334 Mon Sep 17 00:00:00 2001 From: x-senpai-x Date: Wed, 22 Apr 2026 00:36:18 +0530 Subject: [PATCH 08/16] switched runner to ubuntu-24.04-arm --- .github/workflows/noir-execution-success.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/noir-execution-success.yml b/.github/workflows/noir-execution-success.yml index 999247592..898764a5e 100644 --- a/.github/workflows/noir-execution-success.yml +++ b/.github/workflows/noir-execution-success.yml @@ -29,7 +29,7 @@ jobs: # Block fork PRs from executing arbitrary build scripts on the self-hosted runner. 
if: ${{ github.event_name == 'workflow_dispatch' || github.event.pull_request.head.repo.full_name == github.repository }} name: Noir execution_success suite (${{ github.event_name == 'workflow_dispatch' && (github.event.inputs.noir_ref != '' && github.event.inputs.noir_ref || 'v1.0.0-beta.19') || 'v1.0.0-beta.19' }}) - runs-on: [self-hosted, Linux, ARM64, provekit-build] + runs-on: ubuntu-24.04-arm steps: - uses: actions/checkout@v4 From d7eca9901db291e10e4d45138b2d780c6e884829 Mon Sep 17 00:00:00 2001 From: x-senpai-x Date: Thu, 23 Apr 2026 02:49:19 +0530 Subject: [PATCH 09/16] removed mavros comparison --- .github/scripts/build_noir_pr_comment.py | 8 +- .github/workflows/noir-execution-success.yml | 2 +- scripts/generate_provekit_witness_report.py | 75 ++++ scripts/generate_witness_comparison.py | 351 ------------------- scripts/noir_execution_helpers.py | 2 +- scripts/noir_skip_tests.txt | 2 +- scripts/run_noir_execution_success.sh | 26 +- 7 files changed, 96 insertions(+), 370 deletions(-) create mode 100755 scripts/generate_provekit_witness_report.py delete mode 100644 scripts/generate_witness_comparison.py diff --git a/.github/scripts/build_noir_pr_comment.py b/.github/scripts/build_noir_pr_comment.py index bd760a354..168af9e2b 100644 --- a/.github/scripts/build_noir_pr_comment.py +++ b/.github/scripts/build_noir_pr_comment.py @@ -65,7 +65,7 @@ def compose_comment( else "" ) witness_truncated_note = ( - "\n_Witness comparison truncated to fit GitHub comment size limits._\n" + "\n_ProveKit witness report truncated to fit GitHub comment size limits._\n" if witness_truncated else "" ) @@ -94,7 +94,7 @@ def compose_comment( "", "", "
", - "witness_comparison.md", + "provekit_witness_report.md", "", witness_report_text, witness_truncated_note, @@ -149,7 +149,7 @@ def build_with_truncation( excess = len(comment) - MAX_COMMENT_CHARS witness_work, witness_changed = clip_tail( - witness_work, MIN_SECTION_CHARS, excess, "witness_comparison.md" + witness_work, MIN_SECTION_CHARS, excess, "provekit_witness_report.md" ) witness_truncated = witness_truncated or witness_changed if witness_changed: @@ -200,7 +200,7 @@ def main() -> None: args = parse_args() grouped_report_text = read_report(args.grouped_report, "grouped_error_report.txt") - witness_report_text = read_report(args.witness_report, "witness_comparison.md") + witness_report_text = read_report(args.witness_report, "provekit_witness_report.md") body = build_with_truncation( grouped_report_text=grouped_report_text, diff --git a/.github/workflows/noir-execution-success.yml b/.github/workflows/noir-execution-success.yml index 898764a5e..c69195907 100644 --- a/.github/workflows/noir-execution-success.yml +++ b/.github/workflows/noir-execution-success.yml @@ -89,7 +89,7 @@ jobs: run: | python3 .github/scripts/build_noir_pr_comment.py \ --grouped-report "noir-execution-logs/grouped_error_report.txt" \ - --witness-report "noir-execution-logs/witness_comparison.md" \ + --witness-report "noir-execution-logs/provekit_witness_report.md" \ --output "noir-execution-logs/pr_comment.md" \ --run-id "${{ github.run_id }}" \ --run-url "${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" \ diff --git a/scripts/generate_provekit_witness_report.py b/scripts/generate_provekit_witness_report.py new file mode 100755 index 000000000..e8bafc982 --- /dev/null +++ b/scripts/generate_provekit_witness_report.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python3 +"""Generate a ProveKit-only witness count report. 
+ +Usage: python3 generate_provekit_witness_report.py + +Reads a CSV of post-GE constraint and witness counts produced by +scripts/run_noir_execution_success.sh and writes provekit_witness_report.md +to . +""" + +from __future__ import annotations + +import csv +import sys +from pathlib import Path + +SKIP_LIST = Path(__file__).resolve().parent / "noir_skip_tests.txt" + + +def load_skip_tests() -> set[str]: + if not SKIP_LIST.is_file(): + return set() + skip: set[str] = set() + for raw in SKIP_LIST.read_text().splitlines(): + line = raw.strip() + if not line or line.startswith("#"): + continue + skip.add(line) + return skip + + +def main(csv_path: Path, out_dir: Path) -> None: + skip_tests = load_skip_tests() + + rows: dict[str, tuple[int | None, int | None]] = {} + with csv_path.open() as f: + reader = csv.DictReader(f) + for row in reader: + leaf = row["test_name"].split("/")[-1] + if leaf in skip_tests: + continue + + def _parse(key: str) -> int | None: + val = row.get(key, "") + try: + return int(val) + except (TypeError, ValueError): + return None + + rows[leaf] = (_parse("provekit_constraints"), _parse("provekit_witnesses")) + + lines = [ + "# ProveKit Witness Counts", + "", + f"Captured post-GE constraint and witness counts for {len(rows)} circuits.", + "", + "| Test | Constraints (post-GE) | Witnesses (post-GE) |", + "|------|------------------------|----------------------|", + ] + for name in sorted(rows): + constraints, witnesses = rows[name] + c = "-" if constraints is None else str(constraints) + w = "-" if witnesses is None else str(witnesses) + lines.append(f"| {name} | {c} | {w} |") + + out_path = out_dir / "provekit_witness_report.md" + out_path.write_text("\n".join(lines) + "\n") + print(f"Wrote {out_path} ({len(rows)} circuits)") + + +if __name__ == "__main__": + if len(sys.argv) != 3: + print(f"Usage: {sys.argv[0]} ", file=sys.stderr) + sys.exit(1) + main(Path(sys.argv[1]), Path(sys.argv[2])) diff --git a/scripts/generate_witness_comparison.py 
b/scripts/generate_witness_comparison.py deleted file mode 100644 index 75f16dfc9..000000000 --- a/scripts/generate_witness_comparison.py +++ /dev/null @@ -1,351 +0,0 @@ -#!/usr/bin/env python3 -"""Generate Mavros vs ProveKit witness count comparison table. - -Usage: python3 generate_witness_comparison.py - -Reads provekit_witness_counts.csv produced by run_noir_execution_success.sh, -joins against Mavros Cols from the live reilabs/mavros STATUS.md (with -hardcoded fallback for any entries absent from the live file), and writes -witness_comparison.md to . -""" - -import csv -import sys -import urllib.request -from pathlib import Path - -from noir_execution_helpers import load_skip_tests - -# Mavros Cols — extracted from reilabs/mavros STATUS.md (column "Cols"), -# noir/test_programs/execution_success/* rows only. Keys are bare circuit names. -MAVROS_COLS: dict[str, int] = { - "a_1327_concrete_in_generic": 4, - "a_1_mul": 577, - "a_2_div": 610, - "a_3_add": 567, - "a_4_sub": 670, - "a_5_over": 798, - "a_6_array": 5619, - "arithmetic_binary_operations": 654, - "array_dedup_regression": 523, - "array_eq": 161, - "array_neq": 161, - "array_oob_regression_7965": 551, - "array_oob_regression_7975": 6, - "array_rc_regression_7842": 1, - "array_with_refs_from_param": 3, - "as_witness": 3, - "assert": 4, - "assert_statement": 3, - "assign_ex": 3, - "bit_and": 1840, - "bit_not": 523, - "bool_not": 2, - "bool_or": 5, - "break_and_continue": 1, - "brillig_acir_as_brillig": 541, - "brillig_array_ifelse": 8, - "brillig_arrays": 4, - "brillig_block_parameter_liveness": 14119, - "brillig_calls": 541, - "brillig_calls_array": 550, - "brillig_calls_conditionals": 577, - "brillig_conditional": 5, - "brillig_constant_reference_regression": 532, - "brillig_cow": 543, - "brillig_cow_assign": 1, - "brillig_fns_as_values": 593, - "brillig_identity_function": 8, - "brillig_loop_size_regression": 3, - "brillig_nested_arrays": 533, - "brillig_not": 9, - "brillig_recursion": 532, - 
"brillig_recursive_main": 525, - "brillig_recursive_main_indirect": 525, - "brillig_uninitialized_arrays": 790, - "cast_bool": 5, - "cast_to_i8_regression_7776": 648, - "cast_to_u64_regression_7776": 662, - "cast_to_u8_regression_7776": 648, - "comptime_println": 1, - "comptime_println_fmtstr_with_quoted": 1, - "comptime_variable_at_runtime": 1, - "conditional_regression_547": 3, - "conditional_regression_underflow": 517, - "custom_entry": 2, - "databus": 610, - "databus_two_calldata_simple": 619, - "debug_logs": 3, - "diamond_deps_0": 4, - "division_by_max": 528, - "do_not_capture_comptime_locals": 1, - "double_neg_cond_bool_input": 4, - "double_neg_cond_global_var": 809, - "dual_constrained_lambdas": 3, - "field_attribute": 533, - "fmtstr_with_global": 1, - "fold_after_inlined_calls": 539, - "fold_basic": 9, - "fold_basic_nested_call": 7, - "fold_distinct_return": 540, - "generics": 3, - "global_consts": 175, - "global_nested_array_regression_9270": 526, - "global_var_entry_point_used_in_another_entry": 3, - "global_var_func_with_multiple_entry_points": 3, - "global_var_multiple_entry_points_nested": 3, - "inline_never_basic": 5, - "integer_array_indexing": 527, - "lambda_taking_lambda_with_variant": 548, - "last_uses_regression_8935": 524, - "loop": 524, - "loop_break_regression_8319": 1, - "loop_invariant_nested_deep": 2, - "loop_invariant_regression_8586": 2, - "loop_small_break": 2, - "main_return": 3, - "modules": 5, - "modules_more": 5, - "modulus": 864, - "mutate_array_copy": 1, - "negated_jmpif_condition": 5, - "negative_associated_constants": 1, - "nested_array_with_refs": 2, - "nested_array_with_refs_from_param": 3, - "nested_arrays_from_brillig": 19, - "nested_fmtstr": 1, - "no_predicates_basic": 5, - "no_predicates_brillig": 532, - "poseidon_bn254_hash_width_3": 552, - "poseidonsponge_x5_254": 608, - "pred_eq": 5, - "prelude": 1, - "reference_alias_in_array": 1, - "regression_10197": 523, - "regression_10307": 531, - "regression_10466": 1, - 
"regression_10516": 1, - "regression_10690": 1, - "regression_10917": 3, - "regression_10923": 5, - "regression_2660": 572, - "regression_3051": 1, - "regression_3394": 1, - "regression_3607": 596, - "regression_3889": 5, - "regression_4088": 2, - "regression_4124": 2, - "regression_4202": 550, - "regression_4663": 1, - "regression_5435": 3, - "regression_5615": 1, - "regression_6451": 11, - "regression_6674_1": 1, - "regression_6674_2": 1, - "regression_6734": 1, - "regression_6990": 1, - "regression_7143": 532, - "regression_7195": 12, - "regression_7451": 1304, - "regression_7962": 1461, - "regression_8174": 569, - "regression_8212": 524, - "regression_8235": 4, - "regression_8329": 7, - "regression_8519": 677, - "regression_8558": 657, - "regression_8739": 1, - "regression_8761": 2, - "regression_8874": 524, - "regression_8890": 6, - "regression_8926": 532, - "regression_8975": 2, - "regression_9037": 3, - "regression_9047": 521, - "regression_9102": 5, - "regression_9116": 1, - "regression_9160": 3, - "regression_9193": 2, - "regression_9206": 532, - "regression_9243": 1, - "regression_9294": 1, - "regression_9329": 523, - "regression_9546": 529, - "regression_9657": 523, - "regression_9725_1": 1, - "regression_9725_2": 2, - "regression_9907": 3, - "regression_method_cannot_be_found": 1, - "return_twice": 5, - "shift_right_overflow": 517, - "shl_signed_regression_9661": 520, - "signed_bitshift": 1, - "signed_overflow_in_else_regression_8617": 660, - "signed_truncation": 918, - "simple_2d_array": 13, - "simple_add_and_ret_arr": 3, - "simple_array_param": 4, - "simple_bitwise": 1355, - "simple_comparison": 784, - "simple_mut": 3, - "simple_not": 3, - "simple_print": 3, - "simple_program_addition": 3, - "struct": 5, - "struct_array_inputs": 8, - "struct_fields_ordering": 524, - "submodules": 4, - "trait_as_return_type": 523, - "trait_associated_constant": 1, - "trait_impl_base_type": 523, - "traits_in_crates_1": 3, - "traits_in_crates_2": 3, - "tuple_inputs": 
845, - "tuples": 657, - "type_aliases": 5, - "unsafe_range_constraint": 527, - "unsigned_to_signed_cast": 918, - "while_loop_break_regression_8521": 541, - "wildcard_type": 7, - "witness_compression": 6, - "workspace_default_member": 3, - "wrapping_operations": 908, - "xor": 1367, -} - - -_MAVROS_STATUS_URL = ( - "https://raw.githubusercontent.com/reilabs/mavros/main/STATUS.md" -) -_EXEC_SUCCESS_PREFIX = "noir/test_programs/execution_success/" - -# Skip list is shared with scripts/run_noir_execution_success.sh via -# scripts/noir_skip_tests.txt; these circuits are excluded from both sides -# of the witness comparison. -SKIP_TESTS: set[str] = load_skip_tests() - - -def _fetch_live_mavros_cols() -> dict[str, int]: - """Parse Mavros Cols from the live reilabs/mavros STATUS.md. - - Returns an empty dict on any network or parse failure so the caller - can fall back gracefully to the hardcoded table. - """ - try: - with urllib.request.urlopen(_MAVROS_STATUS_URL, timeout=15) as resp: - content = resp.read().decode("utf-8") - except Exception as exc: - print( - f"Warning: could not fetch live Mavros STATUS.md ({exc}); " - "falling back to hardcoded data.", - file=sys.stderr, - ) - return {} - - result: dict[str, int] = {} - for line in content.splitlines(): - if _EXEC_SUCCESS_PREFIX not in line: - continue - parts = line.split("|") - # Table row: | test | Compiled | R1CS | Rows | Cols | ... 
- # After split: parts[1]=test, parts[5]=Cols (1-indexed, 0 is empty) - if len(parts) < 6: - continue - name_field = parts[1].strip() - cols_field = parts[5].strip() - if _EXEC_SUCCESS_PREFIX not in name_field: - continue - circuit = name_field.split(_EXEC_SUCCESS_PREFIX, 1)[1].strip() - if circuit and cols_field.isdigit(): - result[circuit] = int(cols_field) - - if result: - print(f"Fetched {len(result)} Mavros entries from live STATUS.md.") - else: - print("Warning: live STATUS.md parsed 0 entries; using hardcoded data.", file=sys.stderr) - return result - - -def main(csv_path: Path, out_dir: Path) -> None: - # Live data takes precedence; hardcoded fills any gaps. - live = _fetch_live_mavros_cols() - mavros_cols = { - name: cols - for name, cols in {**MAVROS_COLS, **live}.items() - if name not in SKIP_TESTS - } - - provekit: dict[str, int] = {} - with csv_path.open() as f: - for row in csv.DictReader(f): - leaf = row["test_name"].split("/")[-1] - if leaf in SKIP_TESTS: - continue - try: - provekit[leaf] = int(row["provekit_witnesses"]) - except (ValueError, KeyError): - continue - - all_names = sorted(set(mavros_cols) | set(provekit)) - comparable = [ - (name, mavros_cols[name], provekit[name]) - for name in all_names - if name in mavros_cols and name in provekit - ] - missing_in_provekit = sum(1 for name in all_names if name in mavros_cols and name not in provekit) - missing_in_mavros = sum(1 for name in all_names if name in provekit and name not in mavros_cols) - - equal = sum(1 for _, m, p in comparable if m == p) - mavros_better = sum(1 for _, m, p in comparable if m < p) - provekit_better = sum(1 for _, m, p in comparable if p < m) - - lines = [ - "# Mavros vs ProveKit Witnesses Count", - "", - f"Union {len(all_names)} circuits: {len(comparable)} comparable, " - f"{missing_in_provekit} missing in ProveKit, {missing_in_mavros} missing in Mavros.", - f"Among comparable: {equal} equal, {mavros_better} Mavros better, " - f"{provekit_better} ProveKit better.", - 
"", - "| Test | Mavros Cols | ProveKit Post-GE | Delta | Better | Factor |", - "|------|-------------|------------------|-------|--------|--------|", - ] - - for name in all_names: - mavros = mavros_cols.get(name) - pk = provekit.get(name) - - if mavros is None: - lines.append(f"| {name} | - | {pk} | - | missing_mavros | - |") - continue - - if pk is None: - lines.append(f"| {name} | {mavros} | - | - | missing_provekit | - |") - continue - - delta = pk - mavros - delta_str = f"+{delta}" if delta > 0 else str(delta) - if mavros == pk: - better = "equal" - factor = "1.00x" - elif pk < mavros: - better = "provekit" - factor = "inf" if pk == 0 else f"{mavros / pk:.2f}x" - else: - better = "mavros" - factor = "inf" if mavros == 0 else f"{pk / mavros:.2f}x" - lines.append(f"| {name} | {mavros} | {pk} | {delta_str} | {better} | {factor} |") - - out_path = out_dir / "witness_comparison.md" - out_path.write_text("\n".join(lines) + "\n") - print( - f"Wrote {out_path} " - f"({len(all_names)} total circuits, {len(comparable)} comparable)" - ) - - -if __name__ == "__main__": - if len(sys.argv) != 3: - print(f"Usage: {sys.argv[0]} ", file=sys.stderr) - sys.exit(1) - main(Path(sys.argv[1]), Path(sys.argv[2])) diff --git a/scripts/noir_execution_helpers.py b/scripts/noir_execution_helpers.py index 080676245..06ffb52d1 100644 --- a/scripts/noir_execution_helpers.py +++ b/scripts/noir_execution_helpers.py @@ -9,7 +9,7 @@ skip-tests — print the skip list (one per line) The skip list lives in scripts/noir_skip_tests.txt and is the single source -of truth shared with scripts/generate_witness_comparison.py. +of truth shared with scripts/generate_provekit_witness_report.py. """ from __future__ import annotations diff --git a/scripts/noir_skip_tests.txt b/scripts/noir_skip_tests.txt index 31b26c337..575dd9d0c 100644 --- a/scripts/noir_skip_tests.txt +++ b/scripts/noir_skip_tests.txt @@ -1,6 +1,6 @@ # Tests that use blackbox functions not yet supported by provekit. 
# Counted as SKIP (not FAIL) by scripts/run_noir_execution_success.sh -# and excluded from scripts/generate_witness_comparison.py. +# and excluded from scripts/generate_provekit_witness_report.py. # Remove entries here once the corresponding blackbox is supported. # # Format: one bare test name per line. Blank lines and `#` comments are ignored. diff --git a/scripts/run_noir_execution_success.sh b/scripts/run_noir_execution_success.sh index 47baad3cd..50848f466 100755 --- a/scripts/run_noir_execution_success.sh +++ b/scripts/run_noir_execution_success.sh @@ -50,7 +50,7 @@ fi # --------------------------------------------------------------------------- # Unimplemented-blackbox skip list # Single source of truth: scripts/noir_skip_tests.txt (shared with -# scripts/generate_witness_comparison.py). Counted as SKIP (not FAIL). +# scripts/generate_provekit_witness_report.py). Counted as SKIP (not FAIL). # --------------------------------------------------------------------------- SKIP_TESTS=() declare -A SKIP_SET @@ -107,7 +107,7 @@ fi mkdir -p "${LOG_DIR}/per_test" GROUPED_REPORT_FILE="${LOG_DIR}/grouped_error_report.txt" WITNESS_CSV="${LOG_DIR}/provekit_witness_counts.csv" -echo "test_name,provekit_witnesses" > "${WITNESS_CSV}" +echo "test_name,provekit_constraints,provekit_witnesses" > "${WITNESS_CSV}" shopt -s nullglob globstar @@ -318,15 +318,17 @@ for test_name in "${test_dirs[@]}"; do fi append_stage_marker "${test_log}" "provekit-cli prepare" "PASS" - # Extract ProveKit post-GE witness count before the log is deleted on success. + # Extract ProveKit post-GE constraint and witness counts before the log is deleted on success. # Keep this non-fatal under `set -euo pipefail` if the log format changes/misses. 
_ge_line="$(grep -o 'After GE optimization: [0-9]* constraints, [0-9]* witnesses' "${test_log}" | tail -1 || true)" + _pk_constraints="" _pk_witnesses="" - if [[ "${_ge_line}" =~ ([0-9]+)\ witnesses$ ]]; then - _pk_witnesses="${BASH_REMATCH[1]}" + if [[ "${_ge_line}" =~ ([0-9]+)\ constraints,\ ([0-9]+)\ witnesses$ ]]; then + _pk_constraints="${BASH_REMATCH[1]}" + _pk_witnesses="${BASH_REMATCH[2]}" fi if [[ -n "${_pk_witnesses}" ]]; then - echo "${test_name},${_pk_witnesses}" >> "${WITNESS_CSV}" + echo "${test_name},${_pk_constraints},${_pk_witnesses}" >> "${WITNESS_CSV}" fi append_stage_marker "${test_log}" "provekit-cli prove" "START" @@ -404,16 +406,16 @@ fi echo "Grouped report : ${GROUPED_REPORT_FILE}" -# Generate Mavros vs ProveKit witness comparison table -if [[ -f "${WITNESS_CSV}" ]] && python3 "${SCRIPT_DIR}/generate_witness_comparison.py" "${WITNESS_CSV}" "${LOG_DIR}"; then - echo "Witness comparison: ${LOG_DIR}/witness_comparison.md" +# Generate ProveKit witness count report +if [[ -f "${WITNESS_CSV}" ]] && python3 "${SCRIPT_DIR}/generate_provekit_witness_report.py" "${WITNESS_CSV}" "${LOG_DIR}"; then + echo "ProveKit witness report: ${LOG_DIR}/provekit_witness_report.md" if [[ -n "${GITHUB_STEP_SUMMARY:-}" ]]; then { echo "" - echo "## Mavros vs ProveKit Witness Count" - head -4 "${LOG_DIR}/witness_comparison.md" + echo "## ProveKit Witness Counts" + head -4 "${LOG_DIR}/provekit_witness_report.md" echo "" - echo "_Full table available in artifact: \`witness_comparison.md\`_" + echo "_Full table available in artifact: \`provekit_witness_report.md\`_" } >> "${GITHUB_STEP_SUMMARY}" fi fi From 6390e92768574ceb3fa2266f9848a6d64cdb59c8 Mon Sep 17 00:00:00 2001 From: Aditya Bisht Date: Tue, 28 Apr 2026 13:40:58 +0530 Subject: [PATCH 10/16] ci(noir): replace witness report in PR comment with failing-circuits list Drop the provekit_witness_report.md table from the sticky PR comment and add a "Failing circuits (N)" details section parsed from the [stages] block of 
grouped_error_report.txt, alongside the grouped report itself. --- .github/scripts/build_noir_pr_comment.py | 77 ++++++++++++-------- .github/workflows/noir-execution-success.yml | 1 - 2 files changed, 45 insertions(+), 33 deletions(-) diff --git a/.github/scripts/build_noir_pr_comment.py b/.github/scripts/build_noir_pr_comment.py index 168af9e2b..5959ea185 100644 --- a/.github/scripts/build_noir_pr_comment.py +++ b/.github/scripts/build_noir_pr_comment.py @@ -30,6 +30,36 @@ def parse_grouped_counts(grouped_report_text: str) -> dict[str, str]: return counts +def parse_failing_circuits(grouped_report_text: str) -> list[str]: + """Extract the flat sorted list of failing circuits from the [stages] section. + + The grouped report's [stages] section only contains failing tests (skipped + tests are routed to [grouped] instead). Each line looks like: + \\t\\t, , ... + """ + match = re.search( + r"^\[stages\]\n(.*?)(?:\n\[|\Z)", + grouped_report_text, + flags=re.DOTALL | re.MULTILINE, + ) + if not match: + return [] + + names: set[str] = set() + for line in match.group(1).splitlines(): + line = line.strip() + if not line: + continue + parts = line.split("\t") + if len(parts) < 3: + continue + for raw in parts[2].split(","): + name = raw.strip() + if name: + names.add(name) + return sorted(names) + + def status_with_icon(status: str) -> str: normalized = (status or "unknown").strip().lower() labels = { @@ -47,9 +77,7 @@ def sanitize_code_fence(text: str) -> str: def compose_comment( grouped_report_text: str, - witness_report_text: str, grouped_truncated: bool, - witness_truncated: bool, run_id: str, run_url: str, sha: str, @@ -64,11 +92,14 @@ def compose_comment( if grouped_truncated else "" ) - witness_truncated_note = ( - "\n_ProveKit witness report truncated to fit GitHub comment size limits._\n" - if witness_truncated - else "" - ) + + failing_circuits = parse_failing_circuits(grouped_report_text) + if failing_circuits: + failing_body = "\n".join(f"- `{name}`" for name 
in failing_circuits) + failing_summary = f"Failing circuits ({len(failing_circuits)})" + else: + failing_body = "_No failing circuits._" + failing_summary = "Failing circuits (0)" lines = [ MARKER, @@ -85,6 +116,13 @@ def compose_comment( f"| SKIP | {counts['SKIP']} |", "", "
", + f"{failing_summary}", + "", + failing_body, + "", + "
", + "", + "
", "grouped_error_report.txt", "", "```text", @@ -93,13 +131,6 @@ def compose_comment( grouped_truncated_note, "
", "", - "
", - "provekit_witness_report.md", - "", - witness_report_text, - witness_truncated_note, - "
", - "", "_This comment is automatically updated by the Noir Execution Success workflow._", "", ] @@ -120,7 +151,6 @@ def clip_tail(text: str, min_chars: int, excess: int, label: str) -> tuple[str, def build_with_truncation( grouped_report_text: str, - witness_report_text: str, run_id: str, run_url: str, sha: str, @@ -128,16 +158,12 @@ def build_with_truncation( status: str, ) -> str: grouped_work = grouped_report_text - witness_work = witness_report_text grouped_truncated = False - witness_truncated = False for _ in range(128): comment = compose_comment( grouped_work, - witness_work, grouped_truncated=grouped_truncated, - witness_truncated=witness_truncated, run_id=run_id, run_url=run_url, sha=sha, @@ -148,13 +174,6 @@ def build_with_truncation( return comment excess = len(comment) - MAX_COMMENT_CHARS - witness_work, witness_changed = clip_tail( - witness_work, MIN_SECTION_CHARS, excess, "provekit_witness_report.md" - ) - witness_truncated = witness_truncated or witness_changed - if witness_changed: - continue - grouped_work, grouped_changed = clip_tail( grouped_work, MIN_SECTION_CHARS, excess, "grouped_error_report.txt" ) @@ -164,12 +183,9 @@ def build_with_truncation( break - # Final hard guard if both reports are already near minimum length. 
fallback = compose_comment( grouped_work, - witness_work, grouped_truncated=True, - witness_truncated=True, run_id=run_id, run_url=run_url, sha=sha, @@ -186,7 +202,6 @@ def build_with_truncation( def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--grouped-report", required=True, type=Path) - parser.add_argument("--witness-report", required=True, type=Path) parser.add_argument("--output", required=True, type=Path) parser.add_argument("--run-id", required=True) parser.add_argument("--run-url", required=True) @@ -200,11 +215,9 @@ def main() -> None: args = parse_args() grouped_report_text = read_report(args.grouped_report, "grouped_error_report.txt") - witness_report_text = read_report(args.witness_report, "provekit_witness_report.md") body = build_with_truncation( grouped_report_text=grouped_report_text, - witness_report_text=witness_report_text, run_id=args.run_id, run_url=args.run_url, sha=args.sha, diff --git a/.github/workflows/noir-execution-success.yml b/.github/workflows/noir-execution-success.yml index c69195907..6cf7a64f7 100644 --- a/.github/workflows/noir-execution-success.yml +++ b/.github/workflows/noir-execution-success.yml @@ -89,7 +89,6 @@ jobs: run: | python3 .github/scripts/build_noir_pr_comment.py \ --grouped-report "noir-execution-logs/grouped_error_report.txt" \ - --witness-report "noir-execution-logs/provekit_witness_report.md" \ --output "noir-execution-logs/pr_comment.md" \ --run-id "${{ github.run_id }}" \ --run-url "${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" \ From 5f0a31907a5716861c28cd5cd1d320f529bf7edd Mon Sep 17 00:00:00 2001 From: Aditya Bisht Date: Tue, 28 Apr 2026 13:52:46 +0530 Subject: [PATCH 11/16] ci(bench): add CSP benchmarks workflow with sticky PR comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a separate Noir-CSP benchmark suite that runs prepare/prove/verify on every 
circuit under noir-examples/csp-benchmarks/ and posts a sticky PR comment with prover time, peak RSS, peak heap, verifier time, proof size, and PKP size — averaged over BENCH_RUNS (default 3) iterations. - scripts/run_csp_benchmarks.sh: bench runner. Per circuit: nargo compile + provekit-cli prepare once, then prove and verify each three times under /usr/bin/time -f '%e %M', capturing prover stderr so the helper can extract the largest "peak memory" entry from the span_stats trace. - scripts/csp_benchmark_helpers.py: parses /usr/bin/time output and the SI-formatted peak-memory entries (with U+202F separator), converts to bytes, and emits a single averaged CSV row per circuit. - .github/scripts/build_csp_pr_comment.py: renders results.csv as a markdown table and wraps it with a marker so it lives in its own sticky comment (no race with the noir-execution-success workflow). - .github/workflows/csp-benchmarks.yml: ubuntu-24.04-arm, runs on PRs from this repo and on workflow_dispatch (with a configurable bench_runs override). --- .github/scripts/build_csp_pr_comment.py | 164 +++++++++++++++++ .github/workflows/csp-benchmarks.yml | 131 +++++++++++++ scripts/csp_benchmark_helpers.py | 191 +++++++++++++++++++ scripts/run_csp_benchmarks.sh | 234 ++++++++++++++++++++++++ 4 files changed, 720 insertions(+) create mode 100644 .github/scripts/build_csp_pr_comment.py create mode 100644 .github/workflows/csp-benchmarks.yml create mode 100755 scripts/csp_benchmark_helpers.py create mode 100755 scripts/run_csp_benchmarks.sh diff --git a/.github/scripts/build_csp_pr_comment.py b/.github/scripts/build_csp_pr_comment.py new file mode 100644 index 000000000..c291080db --- /dev/null +++ b/.github/scripts/build_csp_pr_comment.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python3 +"""Build a sticky PR comment for the CSP benchmarks workflow. + +Reads the CSV emitted by ``scripts/run_csp_benchmarks.sh`` (one row per +circuit) and renders it as a markdown table with human-readable units. 
+""" + +from __future__ import annotations + +import argparse +import csv +from pathlib import Path + +MARKER = "" +MAX_COMMENT_CHARS = 62000 + + +def fmt_bytes(value: float) -> str: + if value <= 0: + return "—" + units = ("B", "KB", "MB", "GB", "TB") + idx = 0 + while value >= 1024 and idx < len(units) - 1: + value /= 1024.0 + idx += 1 + if value >= 100 or idx == 0: + return f"{value:.0f} {units[idx]}" + return f"{value:.2f} {units[idx]}" + + +def fmt_kb_to_bytes(rss_kb: float) -> str: + return fmt_bytes(rss_kb * 1024.0) + + +def fmt_ms(ms: float) -> str: + if ms <= 0: + return "—" + if ms < 1000: + return f"{ms:.0f} ms" + return f"{ms / 1000.0:.2f} s" + + +def status_with_icon(status: str) -> str: + normalized = (status or "unknown").strip().lower() + labels = { + "success": "[PASS]", + "failure": "[FAIL]", + "cancelled": "[CANCELLED]", + "skipped": "[SKIPPED]", + } + return f"{labels.get(normalized, '[INFO]')} {normalized}" + + +def read_rows(csv_path: Path) -> list[dict[str, str]]: + if not csv_path.is_file(): + return [] + with csv_path.open(newline="") as f: + return list(csv.DictReader(f)) + + +def render_table(rows: list[dict[str, str]]) -> str: + if not rows: + return "_No benchmark results were produced._" + + header = ( + "| Circuit | Prover time | Peak RSS | Peak heap | Verifier time | " + "Proof size | PKP size | Runs |" + ) + sep = "|---|---:|---:|---:|---:|---:|---:|---:|" + lines = [header, sep] + for row in sorted(rows, key=lambda r: r.get("circuit", "")): + lines.append( + "| " + + " | ".join( + [ + f"`{row['circuit']}`", + fmt_ms(float(row.get("prover_time_ms", 0) or 0)), + fmt_kb_to_bytes(float(row.get("prover_peak_rss_kb", 0) or 0)), + fmt_bytes(float(row.get("prover_heap_peak_bytes", 0) or 0)), + fmt_ms(float(row.get("verifier_time_ms", 0) or 0)), + fmt_bytes(float(row.get("proof_size_bytes", 0) or 0)), + fmt_bytes(float(row.get("pkp_size_bytes", 0) or 0)), + row.get("runs", "—"), + ] + ) + + " |" + ) + return "\n".join(lines) + + +def 
compose_comment( + rows: list[dict[str, str]], + run_id: str, + run_url: str, + sha: str, + status: str, + runs_per_circuit: str, +) -> str: + short_sha = sha[:12] if sha else "unknown" + table = render_table(rows) + lines = [ + MARKER, + "## CSP benchmarks", + "", + "| Metric | Value |", + "|--------|-------|", + f"| Workflow status | {status_with_icon(status)} |", + f"| Commit | `{short_sha}` |", + f"| Run | [#{run_id}]({run_url}) |", + f"| Circuits benchmarked | {len(rows)} |", + f"| Iterations averaged per circuit | {runs_per_circuit} |", + "", + "Prover time, peak RSS, peak heap, and verifier time are arithmetic means " + "across the iterations. Peak heap comes from the largest " + "`peak memory` entry in `provekit-cli prove`'s tracing output; peak RSS " + "is reported by `/usr/bin/time -v` (max-resident-set-size).", + "", + "
", + "Results", + "", + table, + "", + "
", + "", + "_This comment is automatically updated by the CSP Benchmarks workflow._", + "", + ] + return "\n".join(lines) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--results-csv", required=True, type=Path) + parser.add_argument("--output", required=True, type=Path) + parser.add_argument("--run-id", required=True) + parser.add_argument("--run-url", required=True) + parser.add_argument("--sha", required=True) + parser.add_argument("--status", required=True) + parser.add_argument("--runs-per-circuit", required=True) + return parser.parse_args() + + +def main() -> None: + args = parse_args() + rows = read_rows(args.results_csv) + body = compose_comment( + rows=rows, + run_id=args.run_id, + run_url=args.run_url, + sha=args.sha, + status=args.status, + runs_per_circuit=args.runs_per_circuit, + ) + if len(body) > MAX_COMMENT_CHARS: + cut = body[: MAX_COMMENT_CHARS - 80].rstrip() + body = f"{cut}\n\n_Comment truncated due to GitHub size limits._\n" + + args.output.parent.mkdir(parents=True, exist_ok=True) + args.output.write_text(body, encoding="utf-8") + print(f"Wrote PR comment body to {args.output} ({len(body)} chars)") + + +if __name__ == "__main__": + main() diff --git a/.github/workflows/csp-benchmarks.yml b/.github/workflows/csp-benchmarks.yml new file mode 100644 index 000000000..8dc95147e --- /dev/null +++ b/.github/workflows/csp-benchmarks.yml @@ -0,0 +1,131 @@ +name: CSP Benchmarks + +on: + pull_request: + workflow_dispatch: + inputs: + bench_runs: + description: "Iterations per circuit (default: 3)" + required: false + default: "3" + +permissions: + contents: read + pull-requests: write + issues: write + +env: + CARGO_TERM_COLOR: always + BENCH_RUNS: ${{ github.event_name == 'workflow_dispatch' && (github.event.inputs.bench_runs != '' && github.event.inputs.bench_runs || '3') || '3' }} + REQUIRED_NARGO_VERSION: "1.0.0-beta.19" + +concurrency: + group: csp-benchmarks-${{ github.ref 
}} + cancel-in-progress: true + +jobs: + csp-benchmarks: + # Block fork PRs from running our heavy bench script on the runner. + if: ${{ github.event_name == 'workflow_dispatch' || github.event.pull_request.head.repo.full_name == github.repository }} + name: CSP benchmarks (avg over ${{ github.event_name == 'workflow_dispatch' && (github.event.inputs.bench_runs != '' && github.event.inputs.bench_runs || '3') || '3' }} runs) + runs-on: ubuntu-24.04-arm + + steps: + - uses: actions/checkout@v4 + + - name: Setup Rust toolchain + uses: moonrepo/setup-rust@v1 + with: + channel: nightly-2026-03-04 + cache-base: main + + - name: Build provekit-cli (release) + run: cargo build --release --bin provekit-cli + + - name: Setup Noir toolchain + uses: noir-lang/noirup@v0.1.2 + with: + toolchain: ${{ env.REQUIRED_NARGO_VERSION }} + + - name: Run CSP benchmarks + env: + PROVEKIT_BIN: ${{ github.workspace }}/target/release/provekit-cli + BENCH_DIR: ${{ github.workspace }}/csp-bench-logs + BENCH_RUNS: ${{ env.BENCH_RUNS }} + run: | + bash scripts/run_csp_benchmarks.sh + + - name: Upload bench artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: csp-bench-logs-${{ github.run_id }} + path: csp-bench-logs/ + retention-days: 7 + + - name: Build sticky PR comment body + if: always() && github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository + continue-on-error: true + run: | + python3 .github/scripts/build_csp_pr_comment.py \ + --results-csv "csp-bench-logs/results.csv" \ + --output "csp-bench-logs/pr_comment.md" \ + --run-id "${{ github.run_id }}" \ + --run-url "${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" \ + --sha "${{ github.sha }}" \ + --status "${{ job.status }}" \ + --runs-per-circuit "${{ env.BENCH_RUNS }}" + + - name: Upsert sticky CSP benchmarks comment + if: always() && github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == 
github.repository + continue-on-error: true + uses: actions/github-script@v7 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + script: | + const fs = require('fs'); + const marker = ''; + const bodyPath = 'csp-bench-logs/pr_comment.md'; + const fallbackBody = [ + marker, + '## CSP benchmarks', + '', + 'Unable to generate the detailed report body for this run.', + '', + 'Run: [#${{ github.run_id }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})', + ].join('\n'); + const body = fs.existsSync(bodyPath) ? fs.readFileSync(bodyPath, 'utf8') : fallbackBody; + + const { owner, repo } = context.repo; + const issue_number = context.issue.number; + const comments = await github.paginate(github.rest.issues.listComments, { + owner, + repo, + issue_number, + per_page: 100, + }); + + const existing = comments.find((comment) => + comment.user && + comment.user.login === 'github-actions[bot]' && + comment.body && + comment.body.includes(marker) + ); + + if (existing) { + await github.rest.issues.updateComment({ + owner, + repo, + comment_id: existing.id, + body, + }); + core.info(`Updated existing CSP benchmarks comment (id=${existing.id}).`); + } else { + const created = await github.rest.issues.createComment({ + owner, + repo, + issue_number, + body, + }); + core.info(`Created new CSP benchmarks comment (id=${created.data.id}).`); + } diff --git a/scripts/csp_benchmark_helpers.py b/scripts/csp_benchmark_helpers.py new file mode 100755 index 000000000..5333ece79 --- /dev/null +++ b/scripts/csp_benchmark_helpers.py @@ -0,0 +1,191 @@ +#!/usr/bin/env python3 +"""Helpers for scripts/run_csp_benchmarks.sh. + +Subcommands: + parse-runs Aggregate per-run measurements for one + circuit and emit a single CSV row to stdout. + human-to-bytes Convert a human-formatted byte string from + the prover trace ("1.23 GB", "456 MB", etc.) + to an integer byte count. Used by tests. 
+ +Bench layout produced by run_csp_benchmarks.sh:: + + /per_circuit// + prove_.time # `/usr/bin/time -f '%e %M'` output + prove_.stderr # provekit-cli prove stderr (span_stats trace) + verify_.time + verify_.stderr + meta.txt # key=value: pkp_size, proof_size + +The "peak heap" comes from the largest "peak memory: B" entry emitted by +``tooling/cli/src/span_stats.rs`` over the prove invocation's trace. We strip +ANSI escapes and walk every span-close line; the outermost span propagates +its children's peak via ``data.peak_memory = max(...)`` so any of them is a +sufficient upper bound, but we keep the max for safety. +""" + +from __future__ import annotations + +import argparse +import re +import sys +from pathlib import Path +from statistics import mean + +ANSI_RE = re.compile(r"\x1b\[[0-9;]*m") +# Suffix table from provekit_common::utils::human (BN254 utils). The middle +# entry is a regular ASCII space (no SI prefix). Order matters: we use it to +# look up the multiplier from a captured suffix character. +SI_SUFFIXES = "qryzafpnμm kMGTPEZYRQ" +SI_BASE_INDEX = SI_SUFFIXES.index(" ") # power 0 lives at index 10 +# The separator between number and SI suffix is U+202F NARROW NO-BREAK SPACE +# unless `{:#}` (alternate) is used. We accept either form. +NARROW_NBSP = " " +PEAK_MEMORY_RE = re.compile( + rf"([0-9]+(?:\.[0-9]+)?)[{NARROW_NBSP} ]?([qryzafpnμmkMGTPEZYRQ])?B" + r"\s+peak\s+memory", +) + + +def human_to_bytes(value: str) -> int: + """Convert a "1.23 GB"-style string from the trace to an integer byte count. + + Accepts either a regular ASCII space or U+202F as the separator. Suffixes + follow ``provekit_common::utils::human`` (q…Q). A literal "B" with no SI + prefix returns the integer/float value rounded down. 
+ """ + cleaned = ANSI_RE.sub("", value).strip() + if not cleaned.endswith("B"): + raise ValueError(f"not a byte-formatted value: {value!r}") + cleaned = cleaned[:-1].rstrip() # drop trailing 'B' + if cleaned and cleaned[-1] in SI_SUFFIXES and cleaned[-1] != " ": + suffix = cleaned[-1] + number_part = cleaned[:-1].rstrip() + else: + suffix = " " + number_part = cleaned + number_part = number_part.replace(NARROW_NBSP, "").strip() + multiplier = 10 ** ((SI_SUFFIXES.index(suffix) - SI_BASE_INDEX) * 3) + return int(float(number_part) * multiplier) + + +def parse_peak_heap_bytes(stderr_path: Path) -> int: + """Return the largest "peak memory" value (bytes) found in the trace.""" + if not stderr_path.is_file(): + return 0 + text = ANSI_RE.sub("", stderr_path.read_text(encoding="utf-8", errors="replace")) + peak = 0 + for match in PEAK_MEMORY_RE.finditer(text): + number = float(match.group(1)) + suffix = match.group(2) or " " + bytes_value = int(number * 10 ** ((SI_SUFFIXES.index(suffix) - SI_BASE_INDEX) * 3)) + peak = max(peak, bytes_value) + return peak + + +def parse_time_file(time_path: Path) -> tuple[float, int]: + """Read `/usr/bin/time -f '%e %M'` output: (wall_seconds, max_rss_kb). + + Returns (0.0, 0) if the file is missing or unparseable. 
+ """ + if not time_path.is_file(): + return 0.0, 0 + raw = time_path.read_text(encoding="utf-8", errors="replace").strip().splitlines() + if not raw: + return 0.0, 0 + parts = raw[-1].split() + if len(parts) < 2: + return 0.0, 0 + try: + return float(parts[0]), int(parts[1]) + except ValueError: + return 0.0, 0 + + +def read_meta(meta_path: Path) -> dict[str, str]: + out: dict[str, str] = {} + if not meta_path.is_file(): + return out + for line in meta_path.read_text(encoding="utf-8").splitlines(): + if "=" in line: + key, _, val = line.partition("=") + out[key.strip()] = val.strip() + return out + + +def parse_runs(bench_dir: Path, circuit: str) -> str: + circuit_dir = bench_dir / "per_circuit" / circuit + meta = read_meta(circuit_dir / "meta.txt") + + prove_runs: list[tuple[float, int, int]] = [] + verify_runs: list[tuple[float, int]] = [] + + i = 1 + while True: + time_path = circuit_dir / f"prove_{i}.time" + if not time_path.is_file(): + break + wall, rss_kb = parse_time_file(time_path) + heap_bytes = parse_peak_heap_bytes(circuit_dir / f"prove_{i}.stderr") + prove_runs.append((wall, rss_kb, heap_bytes)) + i += 1 + + j = 1 + while True: + time_path = circuit_dir / f"verify_{j}.time" + if not time_path.is_file(): + break + wall, _rss = parse_time_file(time_path) + verify_runs.append((wall, _rss)) + j += 1 + + if not prove_runs: + return "" + + prove_time_ms = mean(r[0] for r in prove_runs) * 1000.0 + prover_rss_kb = mean(r[1] for r in prove_runs) + prover_heap_bytes = mean(r[2] for r in prove_runs) + verifier_time_ms = mean(r[0] for r in verify_runs) * 1000.0 if verify_runs else 0.0 + + pkp_size = meta.get("pkp_size_bytes", "0") + proof_size = meta.get("proof_size_bytes", "0") + + return ",".join( + [ + circuit, + f"{prove_time_ms:.1f}", + f"{prover_rss_kb:.0f}", + f"{prover_heap_bytes:.0f}", + f"{verifier_time_ms:.1f}", + proof_size, + pkp_size, + str(len(prove_runs)), + ] + ) + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + 
sub = parser.add_subparsers(dest="cmd", required=True) + + p = sub.add_parser("parse-runs") + p.add_argument("bench_dir", type=Path) + p.add_argument("circuit") + + p = sub.add_parser("human-to-bytes") + p.add_argument("value") + + args = parser.parse_args() + + if args.cmd == "parse-runs": + row = parse_runs(args.bench_dir, args.circuit) + if row: + print(row) + elif args.cmd == "human-to-bytes": + print(human_to_bytes(args.value)) + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/run_csp_benchmarks.sh b/scripts/run_csp_benchmarks.sh new file mode 100755 index 000000000..e7382a34a --- /dev/null +++ b/scripts/run_csp_benchmarks.sh @@ -0,0 +1,234 @@ +#!/usr/bin/env bash +# run_csp_benchmarks.sh +# +# Run prove/verify benchmarks for noir-examples/csp-benchmarks/*. Each circuit +# is compiled and prepared once, then prove + verify are each invoked +# BENCH_RUNS times so the helper can average wall time, peak RSS, and +# heap-peak bytes (parsed from the prover's tracing output). +# +# Environment variables (all optional): +# PROVEKIT_BIN Path to provekit-cli (default: target/release/provekit-cli) +# BENCH_ROOT Path to csp-benchmarks (default: noir-examples/csp-benchmarks) +# BENCH_DIR Output directory (default: csp-bench-logs) +# BENCH_RUNS Iterations to average (default: 3) +# TEST_FILTER Regex on circuit name +# MAX_TESTS Cap on circuits (0 = unlimited) +# +# Output: BENCH_DIR/results.csv with one row per circuit: +# circuit,prover_time_ms,prover_peak_rss_kb,prover_heap_peak_bytes, +# verifier_time_ms,proof_size_bytes,pkp_size_bytes,runs + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." 
&& pwd)" +HELPER="${SCRIPT_DIR}/csp_benchmark_helpers.py" + +PROVEKIT_BIN="${PROVEKIT_BIN:-${REPO_ROOT}/target/release/provekit-cli}" +BENCH_ROOT="${BENCH_ROOT:-${REPO_ROOT}/noir-examples/csp-benchmarks}" +BENCH_DIR="${BENCH_DIR:-${REPO_ROOT}/csp-bench-logs}" +BENCH_RUNS="${BENCH_RUNS:-3}" +TEST_FILTER="${TEST_FILTER:-}" +MAX_TESTS="${MAX_TESTS:-0}" + +if [[ "${BENCH_DIR}" != /* ]]; then + BENCH_DIR="${REPO_ROOT}/${BENCH_DIR}" +fi + +if [[ ! -x "${PROVEKIT_BIN}" ]]; then + echo "ERROR: provekit-cli binary not found at ${PROVEKIT_BIN}" >&2 + echo "Build it first: cargo build --release --bin provekit-cli" >&2 + exit 1 +fi + +if [[ ! -d "${BENCH_ROOT}" ]]; then + echo "ERROR: csp-benchmarks not found at ${BENCH_ROOT}" >&2 + exit 1 +fi + +if ! command -v nargo >/dev/null 2>&1; then + echo "ERROR: nargo is required but not in PATH" >&2 + exit 1 +fi + +if ! python3 -c "import tomllib" 2>/dev/null; then + echo "ERROR: python3.11+ is required (tomllib not found)." >&2 + echo "Current: $(python3 --version 2>&1)" >&2 + exit 1 +fi + +# `/usr/bin/time` is the GNU-style binary; macOS ships a different `time` shell +# builtin so users may need `gtime` from `brew install gnu-time`. CI runs on +# ubuntu-24.04-arm where /usr/bin/time is GNU. +TIME_BIN="" +if [[ -x /usr/bin/time ]]; then + TIME_BIN=/usr/bin/time +elif command -v gtime >/dev/null 2>&1; then + TIME_BIN="$(command -v gtime)" +else + echo "ERROR: GNU /usr/bin/time not found (try: brew install gnu-time)" >&2 + exit 1 +fi + +mkdir -p "${BENCH_DIR}/per_circuit" +RESULTS_CSV="${BENCH_DIR}/results.csv" +echo "circuit,prover_time_ms,prover_peak_rss_kb,prover_heap_peak_bytes,verifier_time_ms,proof_size_bytes,pkp_size_bytes,runs" > "${RESULTS_CSV}" + +shopt -s nullglob + +# Discover circuits: any direct subdir of csp-benchmarks/ that has both a +# Nargo.toml and a Prover.toml at its root. This filters out keccak_lib/. 
+discover_circuits() { + for dir in "${BENCH_ROOT}"/*/; do + if [[ -f "${dir}Nargo.toml" && -f "${dir}Prover.toml" ]]; then + basename "${dir%/}" + fi + done +} + +mapfile -t circuits < <(discover_circuits | sort) +if [[ "${#circuits[@]}" -eq 0 ]]; then + echo "ERROR: no circuits discovered under ${BENCH_ROOT}" >&2 + exit 1 +fi + +echo "Discovered ${#circuits[@]} circuits" + +# Read [package].name from a Nargo.toml; fall back to directory basename. +read_package_name() { + local dir="$1" + python3 - "$dir" <<'PY' +import sys, tomllib, pathlib +nargo = pathlib.Path(sys.argv[1]) / "Nargo.toml" +try: + data = tomllib.loads(nargo.read_text()) + print(data.get("package", {}).get("name", "")) +except Exception: + pass +PY +} + +attempted=0 +succeeded=0 +failed=0 + +for circuit in "${circuits[@]}"; do + if [[ -n "${TEST_FILTER}" && ! "${circuit}" =~ ${TEST_FILTER} ]]; then + continue + fi + (( attempted += 1 )) + if [[ "${MAX_TESTS}" -gt 0 && "${attempted}" -gt "${MAX_TESTS}" ]]; then + break + fi + + workdir="${BENCH_ROOT}/${circuit}" + out_dir="${BENCH_DIR}/per_circuit/${circuit}" + mkdir -p "${out_dir}" + + echo "" + echo "==> [${attempted}/${#circuits[@]}] ${circuit}" + + pkg_name="$(read_package_name "${workdir}")" + if [[ -z "${pkg_name}" ]]; then + pkg_name="${circuit}" + fi + + # 1) compile + if ! (cd "${workdir}" && nargo compile > "${out_dir}/compile.log" 2>&1); then + echo "FAIL: nargo compile (${circuit})" + (( failed += 1 )) + continue + fi + + circuit_json="${workdir}/target/${pkg_name}.json" + if [[ ! -f "${circuit_json}" ]]; then + # Fallback: pick the first json under target/. + candidate=("${workdir}"/target/*.json) + if [[ "${#candidate[@]}" -gt 0 ]]; then + circuit_json="${candidate[0]}" + else + echo "FAIL: no compiled JSON in ${workdir}/target/" + (( failed += 1 )) + continue + fi + fi + + pkp_path="${out_dir}/prover.pkp" + pkv_path="${out_dir}/verifier.pkv" + proof_path="${out_dir}/proof.np" + + # 2) prepare + if ! 
(cd "${workdir}" && "${PROVEKIT_BIN}" prepare "${circuit_json}" \ + --pkp "${pkp_path}" --pkv "${pkv_path}") > "${out_dir}/prepare.log" 2>&1; then + echo "FAIL: provekit-cli prepare (${circuit})" + (( failed += 1 )) + continue + fi + + pkp_size_bytes="$(stat -c '%s' "${pkp_path}" 2>/dev/null || stat -f '%z' "${pkp_path}")" + + # 3) prove × BENCH_RUNS — write each run's stderr separately so the helper + # can parse the tracing output's "peak memory" lines. + prove_ok=1 + for ((i=1; i<=BENCH_RUNS; i++)); do + if ! (cd "${workdir}" && "${TIME_BIN}" -f '%e %M' \ + -o "${out_dir}/prove_${i}.time" \ + "${PROVEKIT_BIN}" prove "${pkp_path}" "${workdir}/Prover.toml" \ + -o "${proof_path}") 2> "${out_dir}/prove_${i}.stderr"; then + echo "FAIL: provekit-cli prove run ${i} (${circuit})" + prove_ok=0 + break + fi + done + if [[ "${prove_ok}" -ne 1 ]]; then + (( failed += 1 )) + continue + fi + + proof_size_bytes="$(stat -c '%s' "${proof_path}" 2>/dev/null || stat -f '%z' "${proof_path}")" + + # 4) verify × BENCH_RUNS + verify_ok=1 + for ((i=1; i<=BENCH_RUNS; i++)); do + if ! 
(cd "${workdir}" && "${TIME_BIN}" -f '%e %M' \ + -o "${out_dir}/verify_${i}.time" \ + "${PROVEKIT_BIN}" verify "${pkv_path}" "${proof_path}") \ + 2> "${out_dir}/verify_${i}.stderr"; then + echo "FAIL: provekit-cli verify run ${i} (${circuit})" + verify_ok=0 + break + fi + done + if [[ "${verify_ok}" -ne 1 ]]; then + (( failed += 1 )) + continue + fi + + cat > "${out_dir}/meta.txt" <> "${RESULTS_CSV}" + echo "OK: ${row}" + (( succeeded += 1 )) + else + echo "FAIL: helper produced no row for ${circuit}" + (( failed += 1 )) + fi +done + +echo "" +echo "----- csp-benchmarks summary -----" +echo "Discovered : ${#circuits[@]}" +echo "Attempted : ${attempted}" +echo "Succeeded : ${succeeded}" +echo "Failed : ${failed}" +echo "Results : ${RESULTS_CSV}" + +if [[ "${failed}" -gt 0 ]]; then + exit 1 +fi +exit 0 From 40fd266de1af4c64ac8da12a6ad623f7afc0c995 Mon Sep 17 00:00:00 2001 From: Aditya Bisht Date: Tue, 28 Apr 2026 14:54:31 +0530 Subject: [PATCH 12/16] fix(ci): unblock noir-execution-success suite and fix typo - scripts/csp_benchmark_helpers.py: spell "unparsable" the way crate-ci/typos expects, fixing the Spell check job that was failing on this PR. - scripts/run_noir_execution_success.sh: don't exit 1 when only some circuits fail. The PR sticky comment surfaces failing circuits, so the workflow shouldn't block PRs on known provekit-cli limitations. Local callers that want the old behaviour can opt back in via STRICT_FAIL=1. --- scripts/csp_benchmark_helpers.py | 2 +- scripts/run_noir_execution_success.sh | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/scripts/csp_benchmark_helpers.py b/scripts/csp_benchmark_helpers.py index 5333ece79..f03b85797 100755 --- a/scripts/csp_benchmark_helpers.py +++ b/scripts/csp_benchmark_helpers.py @@ -86,7 +86,7 @@ def parse_peak_heap_bytes(stderr_path: Path) -> int: def parse_time_file(time_path: Path) -> tuple[float, int]: """Read `/usr/bin/time -f '%e %M'` output: (wall_seconds, max_rss_kb). 
- Returns (0.0, 0) if the file is missing or unparseable. + Returns (0.0, 0) if the file is missing or unparsable. """ if not time_path.is_file(): return 0.0, 0 diff --git a/scripts/run_noir_execution_success.sh b/scripts/run_noir_execution_success.sh index 50848f466..c7a899768 100755 --- a/scripts/run_noir_execution_success.sh +++ b/scripts/run_noir_execution_success.sh @@ -420,7 +420,12 @@ if [[ -f "${WITNESS_CSV}" ]] && python3 "${SCRIPT_DIR}/generate_provekit_witness fi fi -if [[ "${failed}" -gt 0 ]]; then +# Circuit failures are surfaced via the PR sticky comment and the grouped +# error report. The workflow should not fail just because some circuits +# don't compile through provekit-cli today — the report is the source of +# truth for which circuits pass. Set STRICT_FAIL=1 to opt into the old +# "exit 1 on any failure" behaviour for local CI gates. +if [[ "${STRICT_FAIL:-0}" == "1" && "${failed}" -gt 0 ]]; then exit 1 fi From fb27ab516b14058b4b017c641f92880c3d5378d3 Mon Sep 17 00:00:00 2001 From: Aditya Bisht Date: Tue, 28 Apr 2026 14:54:40 +0530 Subject: [PATCH 13/16] feat(bench): show deltas vs. main baseline in CSP PR comment - build_csp_pr_comment.py: drop the "Runs" column (the iteration count is already shown in the metric metadata table) and accept an optional --baseline-csv. When present, each metric cell shows a percentage delta against the latest successful CSP-benchmarks run on main. Circuits absent from the baseline are flagged "(new)". - csp-benchmarks.yml: also run on push to main (so each main commit publishes a new baseline artifact) and add a best-effort step that uses gh to look up the latest successful main run, download its results.csv into csp-bench-logs/baseline/, and pass it through to the renderer along with the baseline run id for the comment link. - Add actions:read so the workflow can list runs and download artifacts from previous runs. 
--- .github/scripts/build_csp_pr_comment.py | 160 ++++++++++++++++++++---- .github/workflows/csp-benchmarks.yml | 39 +++++- 2 files changed, 177 insertions(+), 22 deletions(-) diff --git a/.github/scripts/build_csp_pr_comment.py b/.github/scripts/build_csp_pr_comment.py index c291080db..bff8ce8cb 100644 --- a/.github/scripts/build_csp_pr_comment.py +++ b/.github/scripts/build_csp_pr_comment.py @@ -2,7 +2,9 @@ """Build a sticky PR comment for the CSP benchmarks workflow. Reads the CSV emitted by ``scripts/run_csp_benchmarks.sh`` (one row per -circuit) and renders it as a markdown table with human-readable units. +circuit) and renders it as a markdown table with human-readable units. If +``--baseline-csv`` is given, each metric cell appends a percentage delta +versus the baseline value (last successful CSP-benchmarks run on main). """ from __future__ import annotations @@ -14,6 +16,16 @@ MARKER = "" MAX_COMMENT_CHARS = 62000 +# Metric columns we render with a delta. Order matches the table header. +METRIC_COLUMNS: tuple[tuple[str, str], ...] = ( + ("prover_time_ms", "ms"), + ("prover_peak_rss_kb", "kb"), + ("prover_heap_peak_bytes", "bytes"), + ("verifier_time_ms", "ms"), + ("proof_size_bytes", "bytes"), + ("pkp_size_bytes", "bytes"), +) + def fmt_bytes(value: float) -> str: if value <= 0: @@ -40,6 +52,37 @@ def fmt_ms(ms: float) -> str: return f"{ms / 1000.0:.2f} s" +def fmt_value(unit: str, value: float) -> str: + if unit == "ms": + return fmt_ms(value) + if unit == "kb": + return fmt_kb_to_bytes(value) + return fmt_bytes(value) + + +def fmt_delta(current: float, baseline: float | None) -> str: + """Return a compact delta-vs-baseline annotation, or empty string. + + - Returns "" when no baseline is available. + - Returns "(new)" when current is present but baseline is missing + for this circuit. + - Returns "(±0.0%)" / "(+1.2%)" / "(-3.4%)" otherwise. 
+ """ + if baseline is None: + return "" + if baseline <= 0: + # Baseline collected zero (e.g., older CSV without this metric). + # Don't show a misleading divide-by-zero ratio. + return "" + if current <= 0: + return "" + delta_pct = (current - baseline) / baseline * 100.0 + if abs(delta_pct) < 0.05: + return " (±0.0%)" + sign = "+" if delta_pct > 0 else "" + return f" ({sign}{delta_pct:.1f}%)" + + def status_with_icon(status: str) -> str: normalized = (status or "unknown").strip().lower() labels = { @@ -58,38 +101,69 @@ def read_rows(csv_path: Path) -> list[dict[str, str]]: return list(csv.DictReader(f)) -def render_table(rows: list[dict[str, str]]) -> str: +def index_baseline(rows: list[dict[str, str]]) -> dict[str, dict[str, float]]: + """Index baseline rows by circuit name with float metric values.""" + out: dict[str, dict[str, float]] = {} + for row in rows: + circuit = (row.get("circuit") or "").strip() + if not circuit: + continue + metrics: dict[str, float] = {} + for metric, _unit in METRIC_COLUMNS: + try: + metrics[metric] = float(row.get(metric) or 0) + except ValueError: + metrics[metric] = 0.0 + out[circuit] = metrics + return out + + +def render_table( + rows: list[dict[str, str]], + baseline: dict[str, dict[str, float]], + has_baseline_file: bool, +) -> str: if not rows: return "_No benchmark results were produced._" header = ( "| Circuit | Prover time | Peak RSS | Peak heap | Verifier time | " - "Proof size | PKP size | Runs |" + "Proof size | PKP size |" ) - sep = "|---|---:|---:|---:|---:|---:|---:|---:|" + sep = "|---|---:|---:|---:|---:|---:|---:|" lines = [header, sep] + for row in sorted(rows, key=lambda r: r.get("circuit", "")): - lines.append( - "| " - + " | ".join( - [ - f"`{row['circuit']}`", - fmt_ms(float(row.get("prover_time_ms", 0) or 0)), - fmt_kb_to_bytes(float(row.get("prover_peak_rss_kb", 0) or 0)), - fmt_bytes(float(row.get("prover_heap_peak_bytes", 0) or 0)), - fmt_ms(float(row.get("verifier_time_ms", 0) or 0)), - 
fmt_bytes(float(row.get("proof_size_bytes", 0) or 0)), - fmt_bytes(float(row.get("pkp_size_bytes", 0) or 0)), - row.get("runs", "—"), - ] - ) - + " |" - ) + circuit = row.get("circuit", "") + baseline_metrics = baseline.get(circuit) + + cells = [f"`{circuit}`"] + for metric, unit in METRIC_COLUMNS: + try: + value = float(row.get(metric) or 0) + except ValueError: + value = 0.0 + + value_str = fmt_value(unit, value) + + if has_baseline_file and value_str != "—": + if baseline_metrics is None: + delta = " (new)" + else: + delta = fmt_delta(value, baseline_metrics.get(metric)) + cells.append(f"{value_str}{delta}") + else: + cells.append(value_str) + lines.append("| " + " | ".join(cells) + " |") + return "\n".join(lines) def compose_comment( rows: list[dict[str, str]], + baseline: dict[str, dict[str, float]], + baseline_run_id: str, + has_baseline_file: bool, run_id: str, run_url: str, sha: str, @@ -97,7 +171,28 @@ def compose_comment( runs_per_circuit: str, ) -> str: short_sha = sha[:12] if sha else "unknown" - table = render_table(rows) + table = render_table(rows, baseline, has_baseline_file) + + if has_baseline_file: + if baseline_run_id: + baseline_note = ( + f"Each metric cell shows the current value followed by the " + f"percentage delta against the latest successful " + f"[`main` run #{baseline_run_id}](https://github.com/worldfnd/provekit/actions/runs/{baseline_run_id}). " + f"`(new)` marks circuits absent from the baseline." + ) + else: + baseline_note = ( + "Each metric cell shows the current value followed by the " + "percentage delta against the latest successful `main` run. " + "`(new)` marks circuits absent from the baseline." 
+ ) + else: + baseline_note = ( + "_No baseline available yet — deltas will appear once this " + "workflow has produced at least one successful `main` run._" + ) + lines = [ MARKER, "## CSP benchmarks", @@ -115,6 +210,8 @@ def compose_comment( "`peak memory` entry in `provekit-cli prove`'s tracing output; peak RSS " "is reported by `/usr/bin/time -v` (max-resident-set-size).", "", + baseline_note, + "", "
", "Results", "", @@ -131,6 +228,17 @@ def compose_comment( def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--results-csv", required=True, type=Path) + parser.add_argument( + "--baseline-csv", + type=Path, + default=None, + help="Optional CSV from the latest successful main run.", + ) + parser.add_argument( + "--baseline-run-id", + default="", + help="Optional Actions run id of the baseline (for the link in the comment).", + ) parser.add_argument("--output", required=True, type=Path) parser.add_argument("--run-id", required=True) parser.add_argument("--run-url", required=True) @@ -143,8 +251,18 @@ def parse_args() -> argparse.Namespace: def main() -> None: args = parse_args() rows = read_rows(args.results_csv) + + has_baseline_file = bool( + args.baseline_csv and args.baseline_csv.is_file() + ) + baseline_rows = read_rows(args.baseline_csv) if has_baseline_file else [] + baseline = index_baseline(baseline_rows) + body = compose_comment( rows=rows, + baseline=baseline, + baseline_run_id=args.baseline_run_id, + has_baseline_file=has_baseline_file, run_id=args.run_id, run_url=args.run_url, sha=args.sha, diff --git a/.github/workflows/csp-benchmarks.yml b/.github/workflows/csp-benchmarks.yml index 8dc95147e..3e17f6368 100644 --- a/.github/workflows/csp-benchmarks.yml +++ b/.github/workflows/csp-benchmarks.yml @@ -2,6 +2,8 @@ name: CSP Benchmarks on: pull_request: + push: + branches: [main] workflow_dispatch: inputs: bench_runs: @@ -13,6 +15,9 @@ permissions: contents: read pull-requests: write issues: write + # Needed to read artifacts from previous successful main runs so we can + # render percentage deltas in the PR comment. + actions: read env: CARGO_TERM_COLOR: always @@ -26,7 +31,8 @@ concurrency: jobs: csp-benchmarks: # Block fork PRs from running our heavy bench script on the runner. 
- if: ${{ github.event_name == 'workflow_dispatch' || github.event.pull_request.head.repo.full_name == github.repository }} + # Push to main always runs so the artifact becomes a baseline for PRs. + if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} name: CSP benchmarks (avg over ${{ github.event_name == 'workflow_dispatch' && (github.event.inputs.bench_runs != '' && github.event.inputs.bench_runs || '3') || '3' }} runs) runs-on: ubuntu-24.04-arm @@ -63,12 +69,43 @@ jobs: path: csp-bench-logs/ retention-days: 7 + - name: Fetch baseline from latest successful main run + if: always() && github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository + continue-on-error: true + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GH_REPO: ${{ github.repository }} + run: | + set -e + BASELINE_RUN_ID="$(gh run list \ + --workflow csp-benchmarks.yml \ + --branch main \ + --status success \ + --limit 1 \ + --json databaseId \ + --jq '.[0].databaseId // empty')" + if [[ -z "${BASELINE_RUN_ID}" ]]; then + echo "No successful main run found yet; deltas will not be shown." + exit 0 + fi + echo "Baseline run id: ${BASELINE_RUN_ID}" + mkdir -p csp-bench-logs/baseline + if gh run download "${BASELINE_RUN_ID}" \ + --name "csp-bench-logs-${BASELINE_RUN_ID}" \ + --dir csp-bench-logs/baseline; then + echo "BASELINE_RUN_ID=${BASELINE_RUN_ID}" >> "$GITHUB_ENV" + else + echo "Baseline artifact not retrievable; deltas will not be shown." 
+ fi + - name: Build sticky PR comment body if: always() && github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository continue-on-error: true run: | python3 .github/scripts/build_csp_pr_comment.py \ --results-csv "csp-bench-logs/results.csv" \ + --baseline-csv "csp-bench-logs/baseline/results.csv" \ + --baseline-run-id "${BASELINE_RUN_ID:-}" \ --output "csp-bench-logs/pr_comment.md" \ --run-id "${{ github.run_id }}" \ --run-url "${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" \ From c0b171ecbbc2c5a0c68d0dc520d5b2f054a84a6c Mon Sep 17 00:00:00 2001 From: Aditya Bisht Date: Tue, 28 Apr 2026 15:37:56 +0530 Subject: [PATCH 14/16] ci(comments): drop grouped report block and auto-update footer - build_noir_pr_comment.py: stop rendering the raw grouped_error_report.txt details block. The failing-circuits list and PASS/FAIL/SKIP counts are parsed from the report and shown directly, so the raw dump is redundant. Truncation infrastructure (clip_tail, build_with_truncation, the sanitize_code_fence helper) is removed since the comment is now small enough to fit comfortably; a single hard-cut guard remains. - build_csp_pr_comment.py: remove the trailing "_This comment is automatically updated_" footer. --- .github/scripts/build_csp_pr_comment.py | 2 - .github/scripts/build_noir_pr_comment.py | 90 ++---------------------- 2 files changed, 5 insertions(+), 87 deletions(-) diff --git a/.github/scripts/build_csp_pr_comment.py b/.github/scripts/build_csp_pr_comment.py index bff8ce8cb..2f23107cd 100644 --- a/.github/scripts/build_csp_pr_comment.py +++ b/.github/scripts/build_csp_pr_comment.py @@ -219,8 +219,6 @@ def compose_comment( "", "
", "", - "_This comment is automatically updated by the CSP Benchmarks workflow._", - "", ] return "\n".join(lines) diff --git a/.github/scripts/build_noir_pr_comment.py b/.github/scripts/build_noir_pr_comment.py index 5959ea185..f98bc7723 100644 --- a/.github/scripts/build_noir_pr_comment.py +++ b/.github/scripts/build_noir_pr_comment.py @@ -9,7 +9,6 @@ MARKER = "" MAX_COMMENT_CHARS = 62000 -MIN_SECTION_CHARS = 1500 def read_report(path: Path, display_name: str) -> str: @@ -71,13 +70,8 @@ def status_with_icon(status: str) -> str: return f"{labels.get(normalized, '[INFO]')} {normalized}" -def sanitize_code_fence(text: str) -> str: - return text.replace("```", "``\\`") - - def compose_comment( grouped_report_text: str, - grouped_truncated: bool, run_id: str, run_url: str, sha: str, @@ -87,12 +81,6 @@ def compose_comment( counts = parse_grouped_counts(grouped_report_text) short_sha = sha[:12] if sha else "unknown" - grouped_truncated_note = ( - "\n_Grouped report truncated to fit GitHub comment size limits._\n" - if grouped_truncated - else "" - ) - failing_circuits = parse_failing_circuits(grouped_report_text) if failing_circuits: failing_body = "\n".join(f"- `{name}`" for name in failing_circuits) @@ -122,83 +110,11 @@ def compose_comment( "", "
", "", - "
", - "grouped_error_report.txt", - "", - "```text", - sanitize_code_fence(grouped_report_text), - "```", - grouped_truncated_note, - "
", - "", - "_This comment is automatically updated by the Noir Execution Success workflow._", - "", ] return "\n".join(lines) -def clip_tail(text: str, min_chars: int, excess: int, label: str) -> tuple[str, bool]: - if len(text) <= min_chars or excess <= 0: - return text, False - - reduction = min(len(text) - min_chars, excess + 1024) - kept = text[: len(text) - reduction].rstrip() - omitted = len(text) - len(kept) - clipped = f"{kept}\n\n[... truncated {omitted} characters from {label} ...]" - return clipped, True - - -def build_with_truncation( - grouped_report_text: str, - run_id: str, - run_url: str, - sha: str, - noir_ref: str, - status: str, -) -> str: - grouped_work = grouped_report_text - grouped_truncated = False - - for _ in range(128): - comment = compose_comment( - grouped_work, - grouped_truncated=grouped_truncated, - run_id=run_id, - run_url=run_url, - sha=sha, - noir_ref=noir_ref, - status=status, - ) - if len(comment) <= MAX_COMMENT_CHARS: - return comment - - excess = len(comment) - MAX_COMMENT_CHARS - grouped_work, grouped_changed = clip_tail( - grouped_work, MIN_SECTION_CHARS, excess, "grouped_error_report.txt" - ) - grouped_truncated = grouped_truncated or grouped_changed - if grouped_changed: - continue - - break - - fallback = compose_comment( - grouped_work, - grouped_truncated=True, - run_id=run_id, - run_url=run_url, - sha=sha, - noir_ref=noir_ref, - status=status, - ) - if len(fallback) <= MAX_COMMENT_CHARS: - return fallback - - hard_cut = fallback[: MAX_COMMENT_CHARS - 120].rstrip() - return f"{hard_cut}\n\n_Comment truncated due to GitHub size limits._\n" - - def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--grouped-report", required=True, type=Path) @@ -216,7 +132,7 @@ def main() -> None: grouped_report_text = read_report(args.grouped_report, "grouped_error_report.txt") - body = build_with_truncation( + body = compose_comment( grouped_report_text=grouped_report_text, 
run_id=args.run_id, run_url=args.run_url, @@ -225,6 +141,10 @@ def main() -> None: status=args.status, ) + if len(body) > MAX_COMMENT_CHARS: + cut = body[: MAX_COMMENT_CHARS - 80].rstrip() + body = f"{cut}\n\n_Comment truncated due to GitHub size limits._\n" + args.output.parent.mkdir(parents=True, exist_ok=True) args.output.write_text(body, encoding="utf-8") print(f"Wrote PR comment body to {args.output} ({len(body)} chars)") From f775db8068d0491f640e4df27ce8abecfb62be26 Mon Sep 17 00:00:00 2001 From: Aditya Bisht Date: Tue, 28 Apr 2026 16:16:20 +0530 Subject: [PATCH 15/16] feat(bench): add constraints and witnesses columns to CSP PR comment Reads the counts from `provekit-cli prove`'s `Read Noir proof scheme` log line (already captured per run); no CLI changes required. Renders both as integer columns with thousands separators and the same delta-vs-main treatment as the perf metrics. Backwards-compatible with older baseline CSVs lacking the columns. --- .github/scripts/build_csp_pr_comment.py | 16 +++++++++++++--- scripts/csp_benchmark_helpers.py | 22 ++++++++++++++++++++++ scripts/run_csp_benchmarks.sh | 7 ++++--- 3 files changed, 39 insertions(+), 6 deletions(-) diff --git a/.github/scripts/build_csp_pr_comment.py b/.github/scripts/build_csp_pr_comment.py index 2f23107cd..31e9670f9 100644 --- a/.github/scripts/build_csp_pr_comment.py +++ b/.github/scripts/build_csp_pr_comment.py @@ -18,6 +18,8 @@ # Metric columns we render with a delta. Order matches the table header. METRIC_COLUMNS: tuple[tuple[str, str], ...] 
= ( + ("num_constraints", "int"), + ("num_witnesses", "int"), ("prover_time_ms", "ms"), ("prover_peak_rss_kb", "kb"), ("prover_heap_peak_bytes", "bytes"), @@ -52,11 +54,19 @@ def fmt_ms(ms: float) -> str: return f"{ms / 1000.0:.2f} s" +def fmt_int(value: float) -> str: + if value <= 0: + return "—" + return f"{int(round(value)):,}" + + def fmt_value(unit: str, value: float) -> str: if unit == "ms": return fmt_ms(value) if unit == "kb": return fmt_kb_to_bytes(value) + if unit == "int": + return fmt_int(value) return fmt_bytes(value) @@ -127,10 +137,10 @@ def render_table( return "_No benchmark results were produced._" header = ( - "| Circuit | Prover time | Peak RSS | Peak heap | Verifier time | " - "Proof size | PKP size |" + "| Circuit | Constraints | Witnesses | Prover time | Peak RSS | " + "Peak heap | Verifier time | Proof size | PKP size |" ) - sep = "|---|---:|---:|---:|---:|---:|---:|" + sep = "|---|---:|---:|---:|---:|---:|---:|---:|---:|" lines = [header, sep] for row in sorted(rows, key=lambda r: r.get("circuit", "")): diff --git a/scripts/csp_benchmark_helpers.py b/scripts/csp_benchmark_helpers.py index f03b85797..3bd141a0f 100755 --- a/scripts/csp_benchmark_helpers.py +++ b/scripts/csp_benchmark_helpers.py @@ -45,6 +45,11 @@ rf"([0-9]+(?:\.[0-9]+)?)[{NARROW_NBSP} ]?([qryzafpnμmkMGTPEZYRQ])?B" r"\s+peak\s+memory", ) +# Matches the `info!(constraints, witnesses, "Read Noir proof scheme")` line +# emitted by `tooling/cli/src/cmd/prove.rs` on every prove invocation. 
+SCHEME_SIZE_RE = re.compile( + r"Read Noir proof scheme\b.*?\bconstraints=(\d+)\b.*?\bwitnesses=(\d+)\b" +) def human_to_bytes(value: str) -> int: @@ -83,6 +88,17 @@ def parse_peak_heap_bytes(stderr_path: Path) -> int: return peak +def parse_scheme_sizes(stderr_path: Path) -> tuple[int, int]: + """Return (num_constraints, num_witnesses) from a prove stderr; (0, 0) if absent.""" + if not stderr_path.is_file(): + return 0, 0 + text = ANSI_RE.sub("", stderr_path.read_text(encoding="utf-8", errors="replace")) + match = SCHEME_SIZE_RE.search(text) + if not match: + return 0, 0 + return int(match.group(1)), int(match.group(2)) + + def parse_time_file(time_path: Path) -> tuple[float, int]: """Read `/usr/bin/time -f '%e %M'` output: (wall_seconds, max_rss_kb). @@ -130,6 +146,10 @@ def parse_runs(bench_dir: Path, circuit: str) -> str: prove_runs.append((wall, rss_kb, heap_bytes)) i += 1 + # Constraint and witness counts are deterministic per circuit, so reading + # them from the first prove run is sufficient. 
+ num_constraints, num_witnesses = parse_scheme_sizes(circuit_dir / "prove_1.stderr") + j = 1 while True: time_path = circuit_dir / f"verify_{j}.time" @@ -153,6 +173,8 @@ def parse_runs(bench_dir: Path, circuit: str) -> str: return ",".join( [ circuit, + str(num_constraints), + str(num_witnesses), f"{prove_time_ms:.1f}", f"{prover_rss_kb:.0f}", f"{prover_heap_bytes:.0f}", diff --git a/scripts/run_csp_benchmarks.sh b/scripts/run_csp_benchmarks.sh index e7382a34a..e099c636c 100755 --- a/scripts/run_csp_benchmarks.sh +++ b/scripts/run_csp_benchmarks.sh @@ -15,8 +15,9 @@ # MAX_TESTS Cap on circuits (0 = unlimited) # # Output: BENCH_DIR/results.csv with one row per circuit: -# circuit,prover_time_ms,prover_peak_rss_kb,prover_heap_peak_bytes, -# verifier_time_ms,proof_size_bytes,pkp_size_bytes,runs +# circuit,num_constraints,num_witnesses,prover_time_ms,prover_peak_rss_kb, +# prover_heap_peak_bytes,verifier_time_ms,proof_size_bytes,pkp_size_bytes, +# runs set -euo pipefail @@ -72,7 +73,7 @@ fi mkdir -p "${BENCH_DIR}/per_circuit" RESULTS_CSV="${BENCH_DIR}/results.csv" -echo "circuit,prover_time_ms,prover_peak_rss_kb,prover_heap_peak_bytes,verifier_time_ms,proof_size_bytes,pkp_size_bytes,runs" > "${RESULTS_CSV}" +echo "circuit,num_constraints,num_witnesses,prover_time_ms,prover_peak_rss_kb,prover_heap_peak_bytes,verifier_time_ms,proof_size_bytes,pkp_size_bytes,runs" > "${RESULTS_CSV}" shopt -s nullglob From ffbdd9f29144f229e93ed6a24557e6930c894ff3 Mon Sep 17 00:00:00 2001 From: Aditya Bisht Date: Tue, 28 Apr 2026 16:16:27 +0530 Subject: [PATCH 16/16] fix(ci): address noir-execution-success review feedback - Deduplicate NOIR_REF: collapse the nested ternary in env to a single `||` fallback and have the job name read `${{ env.NOIR_REF }}` so the literal version string only needs to bump in two places. - Fix attempted overcount: blackbox skips bump `skipped` without bumping `total`, so summing passed+failed+skipped double-counts them. Track `attempted=${total}` instead. 
Observed 332 PASS / 15 FAIL / 20 SKIP run now reports 347 (was 367).
---
 .github/workflows/noir-execution-success.yml | 4 ++--
 scripts/run_noir_execution_success.sh        | 4 +++-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/noir-execution-success.yml b/.github/workflows/noir-execution-success.yml
index 6cf7a64f7..8aeafafbb 100644
--- a/.github/workflows/noir-execution-success.yml
+++ b/.github/workflows/noir-execution-success.yml
@@ -17,7 +17,7 @@ permissions:
 
 env:
   CARGO_TERM_COLOR: always
-  NOIR_REF: ${{ github.event_name == 'workflow_dispatch' && (github.event.inputs.noir_ref != '' && github.event.inputs.noir_ref || 'v1.0.0-beta.19') || 'v1.0.0-beta.19' }}
+  NOIR_REF: ${{ github.event.inputs.noir_ref || 'v1.0.0-beta.19' }}
 
 # Cancel any in-progress run on the same branch when a new one is triggered.
 concurrency:
@@ -28,7 +28,7 @@ jobs:
   noir-execution-success:
     # Block fork PRs from executing arbitrary build scripts on the self-hosted runner.
     if: ${{ github.event_name == 'workflow_dispatch' || github.event.pull_request.head.repo.full_name == github.repository }}
-    name: Noir execution_success suite (${{ github.event_name == 'workflow_dispatch' && (github.event.inputs.noir_ref != '' && github.event.inputs.noir_ref || 'v1.0.0-beta.19') || 'v1.0.0-beta.19' }})
+    name: Noir execution_success suite (${{ github.event.inputs.noir_ref || 'v1.0.0-beta.19' }})
     runs-on: ubuntu-24.04-arm
 
     steps:
diff --git a/scripts/run_noir_execution_success.sh b/scripts/run_noir_execution_success.sh
index c7a899768..514f46938 100755
--- a/scripts/run_noir_execution_success.sh
+++ b/scripts/run_noir_execution_success.sh
@@ -360,7 +360,9 @@ for test_name in "${test_dirs[@]}"; do
     rm -f "${test_log}"
 done
 
-attempted=$((passed + failed + skipped))
+# Blackbox skips bump `skipped` without bumping `total` (see the skip block
+# above), so summing passed+failed+skipped would double-count them.
+attempted=${total}
 
 echo ""
 echo "----- execution_success summary -----"