From 9d3524a8d420044c79db9aecad8d0ca7515ef592 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 23 Apr 2026 06:11:24 +0000
Subject: [PATCH 1/2] Initial plan

From f3657f5c832cd208d8799534b3552516af876eca Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 23 Apr 2026 06:16:20 +0000
Subject: [PATCH 2/2] openevolve: wait for CI + run benchmark in CI; extract evaluate.sh

Agent-Logs-Url: https://github.com/githubnext/tsessebe/sessions/4ffc84f5-3ff8-4a4a-a946-14eeae0ee263
Co-authored-by: mrjf <180956+mrjf@users.noreply.github.com>
---
 .../programs/tsb-perf-evolve/evaluate.sh      |  51 ++++++++
 .autoloop/programs/tsb-perf-evolve/program.md |  39 +++----
 .autoloop/strategies/openevolve/strategy.md   |  48 +++++++-
 .github/workflows/ci.yml                      | 109 ++++++++++++++++++
 4 files changed, 226 insertions(+), 21 deletions(-)
 create mode 100755 .autoloop/programs/tsb-perf-evolve/evaluate.sh

diff --git a/.autoloop/programs/tsb-perf-evolve/evaluate.sh b/.autoloop/programs/tsb-perf-evolve/evaluate.sh
new file mode 100755
index 00000000..a79fde6b
--- /dev/null
+++ b/.autoloop/programs/tsb-perf-evolve/evaluate.sh
@@ -0,0 +1,51 @@
+#!/usr/bin/env bash
+# Evaluator for the tsb-perf-evolve OpenEvolve program.
+#
+# Both the autoloop agent (Step 6 of the OpenEvolve playbook) and CI (the
+# `benchmark` job in .github/workflows/ci.yml) invoke this script so they
+# produce comparable fitness numbers from identical commands.
+#
+# Output: a single JSON line on stdout with one of these shapes
+#   {"fitness": <float>, "tsb_mean_ms": <float>, "pandas_mean_ms": <float>}
+#   {"fitness": null, "rejected_reason": "<reason>"}
+#
+# Exit code is always 0 — failures are encoded in the JSON so callers can
+# parse the result uniformly. Diagnostics go to stderr.
+
+set -euo pipefail
+
+# Emit the rejection JSON for reason $1 and stop. Exiting 0 here is what keeps
+# the "always exit 0" contract even though `set -e` is active.
+reject() {
+  printf '{"fitness": null, "rejected_reason": "%s"}\n' "$1"
+  exit 0
+}
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)"
+
+cd "$REPO_ROOT"
+
+# 1. Validity — existing tests for sortValues must still pass.
+if ! bun test tests/core/series.sortValues.test.ts >/tmp/perf-evolve-tests.log 2>&1; then
+  reject "tests failed"
+fi
+
+# 2. Benchmark — tsb side.
+tsb_ms=$(bun run "$SCRIPT_DIR/code/benchmark.ts" \
+  | python3 -c "import json,sys; print(json.load(sys.stdin)['mean_ms'])") \
+  || reject "tsb benchmark failed"
+
+# 3. Benchmark — pandas side. Install pandas on demand; reject if that fails.
+if ! python3 -c 'import pandas' 2>/dev/null; then
+  pip3 install pandas --quiet >&2 || reject "pandas unavailable"
+fi
+pd_ms=$(python3 "$SCRIPT_DIR/code/benchmark.py" \
+  | python3 -c "import json,sys; print(json.load(sys.stdin)['mean_ms'])") \
+  || reject "pandas benchmark failed"
+
+# 4. Fitness = ratio. Lower is better. Values are passed as argv (not spliced
+# into the Python source); a zero or non-numeric mean is rejected, not fatal.
+ratio=$(python3 -c 'import sys; print(float(sys.argv[1]) / float(sys.argv[2]))' \
+  "$tsb_ms" "$pd_ms") || reject "could not compute fitness ratio"
+echo "{\"fitness\": ${ratio}, \"tsb_mean_ms\": ${tsb_ms}, \"pandas_mean_ms\": ${pd_ms}}"
diff --git a/.autoloop/programs/tsb-perf-evolve/program.md b/.autoloop/programs/tsb-perf-evolve/program.md
index a28d1dc2..96e35119 100644
--- a/.autoloop/programs/tsb-perf-evolve/program.md
+++ b/.autoloop/programs/tsb-perf-evolve/program.md
@@ -55,26 +55,25 @@ Population state lives in the state file on the `memory/autoloop` branch under t
 ## Evaluation
 
 ```bash
-set -euo pipefail
-
-# 1. Validity — existing tests for sortValues must still pass.
-bun test tests/core/series.sortValues.test.ts >/tmp/perf-evolve-tests.log 2>&1 || {
-  echo '{"fitness": null, "rejected_reason": "tests failed"}'
-  exit 0
-}
-
-# 2. Benchmark — tsb side.
-tsb_ms=$(bun run .autoloop/programs/tsb-perf-evolve/code/benchmark.ts | python3 -c "import json,sys; print(json.load(sys.stdin)['mean_ms'])")
-
-# 3. Benchmark — pandas side. Skip gracefully if pandas isn't available.
-if ! python3 -c 'import pandas' 2>/dev/null; then
-  pip3 install pandas --quiet 2>/dev/null || true
-fi
-pd_ms=$(python3 .autoloop/programs/tsb-perf-evolve/code/benchmark.py | python3 -c "import json,sys; print(json.load(sys.stdin)['mean_ms'])")
-
-# 4. Fitness = ratio.
Lower is better.
-ratio=$(python3 -c "print(${tsb_ms} / ${pd_ms})")
-echo "{\"fitness\": ${ratio}, \"tsb_mean_ms\": ${tsb_ms}, \"pandas_mean_ms\": ${pd_ms}}"
+bash .autoloop/programs/tsb-perf-evolve/evaluate.sh
+```
+
+The actual evaluator lives in `evaluate.sh` next to this file so the autoloop
+agent (Step 6 of the OpenEvolve playbook) and CI (the `benchmark` job in
+`.github/workflows/ci.yml`) invoke the **exact same** command and produce
+comparable fitness numbers. See that script for details.
+
+It runs the validity tests, then the tsb and pandas benchmarks, and prints a
+single JSON line on stdout:
+
+```json
+{"fitness": <float>, "tsb_mean_ms": <float>, "pandas_mean_ms": <float>}
+```
+
+or, if validity failed:
+
+```json
+{"fitness": null, "rejected_reason": "tests failed"}
 ```
 
 The metric is `fitness` (= `tsb_mean_ms / pandas_mean_ms`). **Lower is better.** A value below `1.0` means tsb is now faster than pandas on this workload.
diff --git a/.autoloop/strategies/openevolve/strategy.md b/.autoloop/strategies/openevolve/strategy.md
index f33c16d4..f9681dda 100644
--- a/.autoloop/strategies/openevolve/strategy.md
+++ b/.autoloop/strategies/openevolve/strategy.md
@@ -75,6 +75,51 @@ Edit only the files listed in `program.md`'s Target section. The diff style for
 
 Run the evaluation command from `program.md`. Parse the metric.
 
+The in-sandbox evaluation is a *cheap pre-filter only* — the agent sandbox often cannot install `bun`, run `python3 -c 'import pandas'`, or otherwise reproduce realistic conditions (the `releaseassets.githubusercontent.com` firewall block is the common culprit). A null/missing metric here is **not** grounds for rejecting the candidate; that decision is deferred to Step 6.5.
+
+### Step 6.5. Wait for CI
+
+Before recording the candidate in the population (Step 7) or posting *any* iteration comment on the program issue / PR, wait for CI on the pushed commit.
CI is the authoritative source of both correctness (Test & Lint / Build / Validate Python Examples) and fitness (the `OpenEvolve benchmark` check, which runs `bash .autoloop/programs/{program-name}/evaluate.sh` on a real runner with `bun` + `python3` + `pandas` installed).
+
+This step extends — and ties into — the generic `Step 5a → 5b → 5c` flow described in the autoloop workflow. OpenEvolve's only added requirement is that you must reach Step 5c (or the budget-exhausted handler) **before** writing the iteration comment, never after a speculative push.
+
+```bash
+# Resolve the PR — prefer the pre-step lookup, fall back to gh.
+PR=$(jq -r '.existing_pr // empty' /tmp/gh-aw/autoloop.json 2>/dev/null || true)
+if [ -z "$PR" ]; then
+  PR=$(gh pr list --head autoloop/{program-name} --json number -q '.[0].number')
+fi
+
+# Block until every required check terminates (or the wall-clock cap fires).
+gh pr checks "$PR" --watch --interval 30 --fail-fast || true
+
+# Determine an aggregate status. Same awk classifier as Step 5a in the generic
+# autoloop playbook (keep them in sync), plus: no rows at all means "pending".
+status=$(gh pr checks "$PR" --json state \
+  -q '.[] | (.state // "")' \
+  | awk '
+    BEGIN { r = "success" }
+    /^(FAILURE|CANCELLED|TIMED_OUT|ACTION_REQUIRED|STARTUP_FAILURE|STALE)$/ { r = "failure" }
+    /^(PENDING|QUEUED|IN_PROGRESS|WAITING|REQUESTED)$/ { if (r == "success") r = "pending" }
+    END { print (NR == 0 ? "pending" : r) }')
+
+# Read the fitness from the OpenEvolve benchmark check-run (created by the
+# `benchmark` job in .github/workflows/ci.yml). Title format: `fitness=<float>`
+# or `fitness=null`. SHA = the HEAD of the PR after the latest push/fix.
+SHA=$(gh pr view "$PR" --json headRefOid -q '.headRefOid')
+fitness=$(gh api "repos/${GITHUB_REPOSITORY}/commits/${SHA}/check-runs" \
+  --jq '.check_runs[] | select(.name == "OpenEvolve benchmark") | .output.title' \
+  | sed -n 's/^fitness=//p' | head -n1)
+```
+
+Branch on `$status`:
+
+- **`success`** → record the candidate in the population with `fitness: <float>` from the check-run (or `fitness: null` only if the `OpenEvolve benchmark` check explicitly reported it that way — e.g., correctness held but the benchmark itself errored). Proceed to Step 7. The iteration comment is `✅ Accepted` with the real numeric fitness.
+- **`failure`** → enter the fix-retry loop from the generic autoloop Step 5b (up to 5 attempts, no-progress guard, 60-min wall-clock cap). Do **not** post an "accepted" comment. On a successful fix, loop back through the `gh pr checks --watch` block above on the new HEAD. On exhausted budget, mark the candidate `status: error` in the population with `fitness: null` and `pause_reason: "ci-fix-exhausted: <reason>"`, and post a `❌ Rejected` (or `⚠️ Error`) iteration comment that links to the failing run.
+- **`pending`** (the wall-clock cap fired before CI concluded, or no check results could be read) → don't post a speculative `⏳ Pending CI` comment. Record the candidate in the population with `fitness: null` and `status: pending-ci`, and leave a single reconciliation-pending comment on the PR/issue that the next iteration's Step 6.5 is allowed to overwrite when it reads the now-concluded status for this same SHA.
+
+In all three branches, the iteration comment posted to the program issue and PR must reflect *terminal* state — never `⏳ Pending CI` as a permanent label. Comments live forever; the pending placeholder is what produced the bug this step exists to fix.
+
 ### Step 7.
Update the population
 
 Regardless of whether the iteration is accepted or rejected at the branch level, the candidate has been tried and should be recorded in the population — the population is a memory of what's been explored, not just what's been kept.
@@ -88,7 +133,8 @@ Append a new entry to the `## 🧬 Population` subsection in the state file usin
 
 Continue with the normal autoloop Step 5 (Accept or Reject → commit / discard, update state file's Machine State, Iteration History, Lessons Learned, etc.) as defined in the workflow. The only additional requirements from OpenEvolve are:
 
-- The Iteration History entry must include `operator`, `parent_id(s)`, `island`, and `fitness` fields (in addition to the normal status/change/metric/notes).
+- The Iteration History entry must include `operator`, `parent_id(s)`, `island`, and `fitness` fields (in addition to the normal status/change/metric/notes). The `fitness` value comes from the `OpenEvolve benchmark` check-run resolved in Step 6.5 — never from the in-sandbox Step 6 estimate.
+- The iteration comment posted to the program issue and PR must use the terminal status from Step 6.5 (`✅ Accepted` / `❌ Rejected` / `⚠️ Error` / `⏸ Pending-CI` only when the wall-clock cap genuinely fired). Never post `⏳ Pending CI` as a final state — that placeholder is what Step 6.5 exists to eliminate.
 - Lessons Learned additions should be phrased as *transferable heuristics* about the problem space, not as reports of what this iteration did. (E.g.
"Hex layouts dominate grid layouts above n=20" — not "Iteration 17 tried a hex layout.")
 
 ## Feature dimensions
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index b5590d66..d2f24b1e 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -11,6 +11,7 @@ on:
 
 permissions:
   contents: read
+  checks: write
 
 jobs:
   test:
@@ -76,3 +77,111 @@ jobs:
 
       - name: Validate Python playground examples
         run: python scripts/validate-python-examples.py playground/
+
+  benchmark:
+    # Run the OpenEvolve benchmark for autoloop *-evolve PRs so the autoloop
+    # agent can read a real fitness number from CI (see .autoloop/strategies/
+    # openevolve/strategy.md, Step 6.5). The sandbox the agent runs in cannot
+    # install bun reliably and so cannot measure fitness itself.
+    name: OpenEvolve benchmark
+    if: |
+      (github.event_name == 'pull_request' && startsWith(github.head_ref, 'autoloop/') && contains(github.head_ref, '-evolve'))
+      || (github.event_name == 'push' && startsWith(github.ref_name, 'autoloop/') && contains(github.ref_name, '-evolve'))
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      checks: write
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Setup Bun
+        uses: oven-sh/setup-bun@v2
+        with:
+          bun-version: latest
+
+      - name: Install dependencies
+        run: bun install
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Install Python dependencies
+        run: pip install pandas numpy
+
+      - name: Resolve program directory
+        id: program
+        run: |
+          # Resolve the program directory from the branch name:
+          #   autoloop/<program-name> → .autoloop/programs/<program-name>/
+          BRANCH="${GITHUB_HEAD_REF:-${GITHUB_REF_NAME}}"
+          PROGRAM="${BRANCH#autoloop/}"
+          PROGRAM_DIR=".autoloop/programs/${PROGRAM}"
+          echo "program=${PROGRAM}" >> "$GITHUB_OUTPUT"
+          echo "program_dir=${PROGRAM_DIR}" >> "$GITHUB_OUTPUT"
+          if [ -x "${PROGRAM_DIR}/evaluate.sh" ]; then
+            echo "has_evaluator=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "No evaluate.sh for program '${PROGRAM}' — skipping benchmark." >&2
+            echo "has_evaluator=false" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Run OpenEvolve benchmark
+        id: bench
+        if: steps.program.outputs.has_evaluator == 'true'
+        env:
+          PROGRAM_DIR: ${{ steps.program.outputs.program_dir }}
+        run: |
+          # PROGRAM_DIR is passed via env because it derives from the branch name.
+          # Tolerate a non-zero exit or non-JSON stdout so the check-run is still created.
+          set +e
+          bash "${PROGRAM_DIR}/evaluate.sh" >/tmp/bench-result.json 2>/tmp/bench-stderr
+          rc=$?
+          set -e
+          if ! jq -es 'length == 1 and (.[0] | type == "object")' /tmp/bench-result.json >/dev/null 2>&1; then
+            echo "{\"fitness\": null, \"rejected_reason\": \"evaluator produced no valid JSON result (exit ${rc})\"}" \
+              > /tmp/bench-result.json
+          fi
+          cat /tmp/bench-result.json
+          fitness=$(jq -r '.fitness // "null"' /tmp/bench-result.json)
+          echo "fitness=${fitness}" >> "$GITHUB_OUTPUT"
+          # Compact JSON for the check-run output below.
+          echo "result_json=$(jq -c . /tmp/bench-result.json)" >> "$GITHUB_OUTPUT"
+
+      - name: Upload benchmark result
+        if: steps.program.outputs.has_evaluator == 'true'
+        uses: actions/upload-artifact@v4
+        with:
+          name: benchmark-result
+          path: /tmp/bench-result.json
+
+      - name: Attach fitness as check-run
+        if: steps.program.outputs.has_evaluator == 'true'
+        uses: actions/github-script@v7
+        env:
+          FITNESS: ${{ steps.bench.outputs.fitness }}
+          RESULT_JSON: ${{ steps.bench.outputs.result_json }}
+        with:
+          script: |
+            const fitness = process.env.FITNESS;
+            let result;
+            try {
+              result = JSON.parse(process.env.RESULT_JSON);
+            } catch {
+              result = { raw: process.env.RESULT_JSON };
+            }
+            const sha = context.payload.pull_request
+              ? context.payload.pull_request.head.sha
+              : context.sha;
+            await github.rest.checks.create({
+              ...context.repo,
+              name: "OpenEvolve benchmark",
+              head_sha: sha,
+              status: "completed",
+              conclusion: fitness === "null" ? "neutral" : "success",
+              output: {
+                title: `fitness=${fitness}`,
+                summary: "```json\n" + JSON.stringify(result, null, 2) + "\n```",
+              },
+            });