From 2904dae19bb04546f1522351c9d0a87424cb4c03 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 23 Apr 2026 21:17:46 +0000 Subject: [PATCH 1/2] Initial plan From cf2b4bb6c4cf0edc1cf647e11b5cb1c638175def Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 23 Apr 2026 21:23:44 +0000 Subject: [PATCH 2/2] Add end-to-end install integration test harness Agent-Logs-Url: https://github.com/githubnext/autoloop/sessions/df881075-c8de-46ce-8314-6945f02f4f05 Co-authored-by: mrjf <180956+mrjf@users.noreply.github.com> --- .../workflows/install-integration-test.yml | 54 +++ tests/install-integration/README.md | 67 ++++ tests/install-integration/prompt.md | 17 + tests/install-integration/run.sh | 352 ++++++++++++++++++ tests/install-integration/teardown.sh | 99 +++++ tests/install-integration/verify-phase1.sh | 82 ++++ tests/install-integration/verify-phase2.sh | 120 ++++++ 7 files changed, 791 insertions(+) create mode 100644 .github/workflows/install-integration-test.yml create mode 100644 tests/install-integration/README.md create mode 100644 tests/install-integration/prompt.md create mode 100755 tests/install-integration/run.sh create mode 100755 tests/install-integration/teardown.sh create mode 100755 tests/install-integration/verify-phase1.sh create mode 100755 tests/install-integration/verify-phase2.sh diff --git a/.github/workflows/install-integration-test.yml b/.github/workflows/install-integration-test.yml new file mode 100644 index 0000000..7b2190f --- /dev/null +++ b/.github/workflows/install-integration-test.yml @@ -0,0 +1,54 @@ +name: Install Integration Test + +# End-to-end test of install.md against a long-lived target repo. +# Manual-dispatch only -- this exercises real LLM calls and force-pushes a +# remote branch, so it must not run on PRs or schedules. 
+
+on:
+  workflow_dispatch:
+    inputs:
+      keep_state_on_failure:
+        description: "Leave test repo in failure state for inspection"
+        type: boolean
+        default: false
+      install_test_repo:
+        description: "Target repo for the install (owner/repo)"
+        type: string
+        default: "mrjf/autoloop-test"
+
+jobs:
+  install-integration:
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    permissions:
+      contents: read
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Install gh aw extension
+        env:
+          GH_TOKEN: ${{ secrets.INSTALL_TEST_TOKEN }}
+        run: gh extension install github/gh-aw
+
+      - name: Install Copilot CLI
+        env:
+          GH_TOKEN: ${{ secrets.INSTALL_TEST_TOKEN }}
+        # The Copilot CLI is distributed as an npm package. If the install
+        # path changes upstream, update this single step.
+        run: npm install -g @github/copilot
+
+      - name: Verify gh auth and configure git credentials
+        env:
+          GH_TOKEN: ${{ secrets.INSTALL_TEST_TOKEN }}
+        run: gh auth status && gh auth setup-git  # run.sh/teardown.sh clone and push the target repo over HTTPS
+
+      - name: Run integration test
+        env:
+          GH_TOKEN: ${{ secrets.INSTALL_TEST_TOKEN }}
+          INSTALL_TEST_REPO: ${{ inputs.install_test_repo }}
+          KEEP_STATE_ON_FAILURE: ${{ inputs.keep_state_on_failure && '1' || '0' }}
+        run: ./tests/install-integration/run.sh
diff --git a/tests/install-integration/README.md b/tests/install-integration/README.md
new file mode 100644
index 0000000..fa81662
--- /dev/null
+++ b/tests/install-integration/README.md
@@ -0,0 +1,67 @@
+# install-integration
+
+End-to-end integration test for [`install.md`](../../install.md). Runs the
+install flow as a real coding agent (Copilot CLI) against a long-lived
+target repo (`mrjf/autoloop-test` by default), then exercises Phase 2 by
+running one iteration each of three programs across the program-source ×
+strategy matrix.
+
+This test is **manual-dispatch only**. It is not part of CI.
+
+## Local mode
+
+```bash
+# from the autoloop repo root:
+./tests/install-integration/run.sh
+```
+
+Requirements:
+
+- `gh` CLI authenticated as a user with write access to the target repo.
+- `copilot` CLI on PATH.
+- `python3` and `git` on PATH.
+
+Optional env / flags:
+
+- `INSTALL_TEST_REPO=<owner>/<repo>` -- override the target (default
+  `mrjf/autoloop-test`).
+- `--keep` (or `KEEP_STATE_ON_FAILURE=1`) -- skip teardown on failure so
+  the failure state can be inspected. Run `teardown.sh <repo> <base-sha>`
+  manually afterwards.
+
+## Actions mode
+
+Trigger the **Install Integration Test** workflow from the Actions tab. It
+runs the same script on a GitHub-hosted runner. Requires the
+`INSTALL_TEST_TOKEN` repo secret -- a PAT with `repo` scope on the target
+repo (the default `GITHUB_TOKEN` only has access to the repository running the workflow).
+
+## What it tests
+
+See [the issue that introduced this harness](https://github.com/githubnext/autoloop/issues)
+for the full motivation. In short:
+
+- **Phase 1** (file presence + lock idempotency) -- catches regressions in
+  `install.md` and in `gh aw compile`.
+- **Phase 2** (3 programs × 1 iteration each) -- catches regressions in
+  the scheduler, in strategy discovery, and in the iteration loop. The
+  three programs cover:
+
+  | # | Source      | Strategy        |
+  |---|-------------|-----------------|
+  | 1 | file-based  | OpenEvolve      |
+  | 2 | issue-based | Test-Driven     |
+  | 3 | file-based  | plain (default) |
+
+- **Phase 3** (teardown) -- resets the target repo to the captured base
+  SHA, closes test issues/PRs, and deletes test branches.
+
+## Files
+
+| File                  | Purpose                                          |
+|-----------------------|--------------------------------------------------|
+| `run.sh`              | Driver. Orchestrates phases 1-3.                 |
+| `prompt.md`           | Prompt fed to Copilot CLI (edit without touching the driver). |
+| `verify-phase1.sh`    | File-presence + lock-idempotency assertions.     |
+| `verify-phase2.sh`    | Per-program assertions (one call per program).   |
+| `teardown.sh`         | Idempotent cleanup. Safe to re-run.              |
diff --git a/tests/install-integration/prompt.md b/tests/install-integration/prompt.md
new file mode 100644
index 0000000..0a2017e
--- /dev/null
+++ b/tests/install-integration/prompt.md
@@ -0,0 +1,17 @@
+You are installing autoloop into a freshly-reset GitHub repository.
+
+Your working directory is the root of that repository, cloned locally. The
+repository is empty except for the base fixtures in `src/` and `tests/`.
+
+Follow the install instructions at the URL below, EXACTLY AS WRITTEN. Execute
+each step using shell commands. Do not skip steps. Do not improvise. Do not
+optimize or "improve" the instructions.
+
+Stop after Step 5 (the install PR is opened). Do NOT proceed to Step 6
+("Create Your First Program") -- the test harness handles program creation
+itself in a deterministic way.
+
+When you finish: print a single line `INSTALL_PR=<url>`, where `<url>` is the URL of the
+PR you opened in step 5. Then stop.
+
+Install instructions: https://github.com/githubnext/autoloop/blob/main/install.md
diff --git a/tests/install-integration/run.sh b/tests/install-integration/run.sh
new file mode 100755
index 0000000..f991085
--- /dev/null
+++ b/tests/install-integration/run.sh
@@ -0,0 +1,352 @@
+#!/usr/bin/env bash
+# run.sh - end-to-end install integration test driver.
+#
+# Runs against $INSTALL_TEST_REPO (default: mrjf/autoloop-test). See the
+# README in this directory for prerequisites and what the test verifies.
+#
+# Flags / env:
+#   --keep                    leave the test repo in failure state for inspection
+#   KEEP_STATE_ON_FAILURE=1   same as --keep (used by the Actions wrapper)
+#   INSTALL_TEST_REPO=...     target repo (default: mrjf/autoloop-test)
+#   AUTOLOOP_REF=main         git ref of install.md to follow (default: main; not yet wired into prompt.md)
+#
+# Exits non-zero on any failed assertion. Cleanup runs in `trap EXIT` so it
+# happens even on abort.
+set -euo pipefail
+
+# --------------------------------------------------------------------------
+# Args / env
+# --------------------------------------------------------------------------
+KEEP="${KEEP_STATE_ON_FAILURE:-0}"
+for arg in "$@"; do
+  case "$arg" in
+    --keep) KEEP=1 ;;
+    -h|--help)
+      sed -n '2,14p' "$0"  # header comment only (lines 2-14); line 15 onwards is code
+      exit 0
+      ;;
+  esac
+done
+
+INSTALL_TEST_REPO="${INSTALL_TEST_REPO:-mrjf/autoloop-test}"
+AUTOLOOP_REF="${AUTOLOOP_REF:-main}"  # NOTE(review): not referenced anywhere else in this script
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+
+log() { echo "[run.sh] $*" >&2; }  # stderr, so $(...) captures of helper functions contain only their result
+hr() { echo "[run.sh] ----------------------------------------" >&2; }
+
+PASS=0
+EXIT_CODE=1
+BASE_SHA=""
+WORKDIR=""
+
+cleanup() {  # EXIT trap: report result, tear down the target repo (unless --keep on failure), remove WORKDIR
+  local rc=$?
+  hr
+  if [ "$PASS" = "1" ]; then
+    log "test PASSED"
+    EXIT_CODE=0
+  else
+    log "test FAILED (exit=$rc)"
+    EXIT_CODE=1
+  fi
+
+  if [ "$KEEP" = "1" ] && [ "$PASS" != "1" ]; then
+    log "KEEP=1 set: skipping teardown so failure state can be inspected"
+    log "remember to run teardown.sh manually:"
+    log " $SCRIPT_DIR/teardown.sh $INSTALL_TEST_REPO $BASE_SHA"
+  else
+    if [ -n "$BASE_SHA" ]; then  # empty when we failed before the base SHA was captured
+      log "running teardown..."
+      "$SCRIPT_DIR/teardown.sh" "$INSTALL_TEST_REPO" "$BASE_SHA" || true
+    fi
+  fi
+
+  if [ -n "$WORKDIR" ] && [ -d "$WORKDIR" ]; then
+    rm -rf "$WORKDIR"
+  fi
+
+  exit "$EXIT_CODE"  # the trap decides the final status: 0 only if PASS=1 was reached
+}
+trap cleanup EXIT
+
+# --------------------------------------------------------------------------
+# Pre-flight
+# --------------------------------------------------------------------------
+hr
+log "pre-flight"
+command -v gh      >/dev/null || { log "gh CLI not on PATH"; exit 1; }
+command -v copilot >/dev/null || { log "copilot CLI not on PATH"; exit 1; }
+command -v python3 >/dev/null || { log "python3 not on PATH"; exit 1; }
+command -v git     >/dev/null || { log "git not on PATH"; exit 1; }
+gh auth status >/dev/null 2>&1 || { log "gh is not authenticated"; exit 1; }
+log "tools ok; target repo: $INSTALL_TEST_REPO"
+
+# Detect whether the autoloop source has sync-branches.md so phase-1
+# verification can require its lock file (issue #52 may remove it).
+if [ -f "$REPO_ROOT/workflows/sync-branches.md" ]; then
+  export EXPECT_SYNC_BRANCHES=1
+  log "sync-branches.md present in source repo: phase-1 will require its lock file"
+else
+  export EXPECT_SYNC_BRANCHES=0
+fi
+
+# --------------------------------------------------------------------------
+# Capture base-state SHA and reset target repo to it.
+# --------------------------------------------------------------------------
+hr
+log "capturing base-state SHA on $INSTALL_TEST_REPO@main"
+BASE_SHA="$(gh api "repos/${INSTALL_TEST_REPO}/branches/main" --jq '.commit.sha')"
+log "base SHA: $BASE_SHA"
+
+log "pre-test reset (discards any debris from prior failed runs)"
+"$SCRIPT_DIR/teardown.sh" "$INSTALL_TEST_REPO" "$BASE_SHA"
+
+# --------------------------------------------------------------------------
+# Clone target locally and feed install.md to copilot.
+# -------------------------------------------------------------------------- +WORKDIR="$(mktemp -d -t autoloop-install-int-XXXXXX)" +CHECKOUT="$WORKDIR/repo" +log "cloning $INSTALL_TEST_REPO -> $CHECKOUT" +git clone --quiet "https://github.com/${INSTALL_TEST_REPO}.git" "$CHECKOUT" + +hr +log "running copilot CLI against install.md (this can take several minutes)" +COPILOT_LOG="$WORKDIR/copilot.log" +PROMPT="$(cat "$SCRIPT_DIR/prompt.md")" +( + cd "$CHECKOUT" + # `--allow-all-tools` lets the agent run the shell commands install.md + # tells it to run; without that the test can't actually exercise the flow. + copilot --allow-all-tools -p "$PROMPT" 2>&1 | tee "$COPILOT_LOG" +) + +# Extract install PR URL from the agent's output. Tolerate quoting/extra ws. +INSTALL_PR="$(grep -Eo 'INSTALL_PR=https://github\.com/[^ ]+' "$COPILOT_LOG" \ + | tail -1 | cut -d= -f2- || true)" +if [ -z "$INSTALL_PR" ]; then + log "could not find INSTALL_PR=... in copilot output" + exit 1 +fi +export INSTALL_PR +log "install PR: $INSTALL_PR" + +# -------------------------------------------------------------------------- +# Phase 1 verification (against the install branch checkout). +# -------------------------------------------------------------------------- +hr +log "PHASE 1: verifying install artifacts" +# Make sure the local checkout is on the install branch the agent used. +HEAD_REF="$(gh pr view "$INSTALL_PR" --repo "$INSTALL_TEST_REPO" --json headRefName --jq '.headRefName')" +( + cd "$CHECKOUT" + git fetch --quiet origin "$HEAD_REF" + git checkout --quiet -B "$HEAD_REF" "origin/$HEAD_REF" +) +"$SCRIPT_DIR/verify-phase1.sh" "$CHECKOUT" + +# -------------------------------------------------------------------------- +# Merge the install PR and wait for it to land. 
+# --------------------------------------------------------------------------
+hr
+log "merging install PR via squash"
+gh pr merge "$INSTALL_PR" --repo "$INSTALL_TEST_REPO" --squash --admin --delete-branch
+# Brief wait so subsequent gh api calls see the new main SHA.
+sleep 5
+
+# --------------------------------------------------------------------------
+# Phase 2: create three programs (file/openevolve, issue/test-driven,
+# file/plain) and run one iteration of each, sequentially.
+# --------------------------------------------------------------------------
+hr
+log "PHASE 2: program × strategy matrix"
+
+# Refresh local checkout to merged main.
+(
+  cd "$CHECKOUT"
+  git fetch --quiet origin main
+  git checkout --quiet -B main origin/main
+)
+
+# Helper: commit a file-based program ($1 = name, $2 = body file) to main and push.
+push_program() {
+  local name="$1"
+  local body_file="$2"
+  (
+    cd "$CHECKOUT"
+    mkdir -p ".autoloop/programs/${name}"
+    cp "$body_file" ".autoloop/programs/${name}/program.md"
+    git add ".autoloop/programs/${name}/program.md"
+    git -c user.email=integration-test@autoloop -c user.name="autoloop integration test" \
+      commit -m "test: add program ${name}" --quiet
+    git push --quiet origin main
+  )
+}
+
+# Helper: dispatch one iteration for program $1; stdout carries ONLY the run id (logs go to stderr).
+run_iteration() {
+  local name="$1"
+  log "dispatching autoloop.lock.yml for program=$name" >&2
+  # Timestamp taken just before dispatch; the `>=` filter below keeps a run
+  # created within this same second from being excluded.
+  local before
+  before="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
+  gh workflow run autoloop.lock.yml --repo "$INSTALL_TEST_REPO" \
+    -f program="$name" >/dev/null
+
+  # Poll (30 x 5s). `// empty` stops jq from printing "null" for an empty list.
+  local run_id="" attempts=0
+  while [ -z "$run_id" ] && [ "$attempts" -lt 30 ]; do
+    sleep 5
+    run_id="$(gh run list --repo "$INSTALL_TEST_REPO" --workflow autoloop.lock.yml \
+      --created ">=${before}" --limit 1 --json databaseId --jq '.[0].databaseId // empty' 2>/dev/null || true)"
+    attempts=$((attempts + 1))
+  done
+  [ -n "$run_id" ] || { log "could not find dispatched run for $name" >&2; return 1; }
+  log "run id: $run_id; waiting for completion..." >&2
+  gh run watch "$run_id" --repo "$INSTALL_TEST_REPO" --exit-status >/dev/null 2>&1 \
+    || log "run $run_id finished with non-zero status (will be checked by verify-phase2.sh)" >&2
+  echo "$run_id"
+}
+
+# ---- Program 1: file-based + OpenEvolve ----------------------------------
+PROG1=rastrigin-openevolve
+PROG1_FILE="$WORKDIR/${PROG1}.md"
+cat > "$PROG1_FILE" <<'EOF'
+---
+schedule: every 6h
+---
+
+# Rastrigin (OpenEvolve)
+
+## Goal
+
+Improve `src/minimize.py` to find lower minima of the Rastrigin function more
+reliably. Lower metric is better.
+
+## Target
+
+Only modify these files:
+- `src/minimize.py` -- the minimizer
+
+Do NOT modify:
+- `src/evaluate.py`
+- `tests/test_minimize.py`
+
+## Evaluation
+
+```bash
+python3 src/evaluate.py
+```
+
+The metric is `metric`. **Lower is better.**
+
+## Evolution Strategy
+
+See `strategy/openevolve.md`. Maintain a small population across these islands:
+- `grid-search` (baseline)
+- `scipy-minimize`
+- `gradient-descent`
+EOF
+
+# ---- Program 3: file-based + plain prose (no strategy) -------------------
+# Defined here so both file-based programs are pushed before any iteration is dispatched.
+PROG3=rastrigin-plain
+PROG3_FILE="$WORKDIR/${PROG3}.md"
+cat > "$PROG3_FILE" <<'EOF'
+---
+schedule: every 6h
+---
+
+# Rastrigin (Plain)
+
+## Goal
+
+Improve `src/minimize.py` to find lower minima of the Rastrigin function. Lower
+metric is better.
+
+## Target
+
+Only modify these files:
+- `src/minimize.py`
+
+Do NOT modify:
+- `src/evaluate.py`
+- `tests/test_minimize.py`
+
+## Evaluation
+
+```bash
+python3 src/evaluate.py
+```
+
+The metric is `metric`. **Lower is better.**
+EOF
+
+push_program "$PROG1" "$PROG1_FILE"
+push_program "$PROG3" "$PROG3_FILE"
+
+# ---- Program 2: issue-based + Test-Driven --------------------------------
+PROG2=rastrigin-tdd
+log "creating issue-based program $PROG2"
+PROG2_BODY=$(cat <<'EOF'
+
+
+---
+schedule: every 6h
+---
+
+# rastrigin-tdd
+
+## Goal
+
+Cover the public API of `src/minimize.py` with tighter correctness tests.
+Higher passing test count is better.
+
+## Target
+
+Only modify these files:
+- `tests/test_minimize.py`
+
+Do NOT modify:
+- `src/minimize.py`
+- `src/evaluate.py`
+
+## Evaluation
+
+```bash
+python3 -m pytest tests/ -q | tail -1
+```
+
+The metric is `passing_tests`. **Higher is better.**
+
+## Evolution Strategy
+
+See `strategy/test-driven.md`. Maintain a test harness with at least one
+candidate per iteration.
+EOF
+)
+gh issue create --repo "$INSTALL_TEST_REPO" \
+  --title "[Autoloop: ${PROG2}]" \
+  --label "autoloop-program" \
+  --body "$PROG2_BODY" >/dev/null
+log "issue created for $PROG2"
+
+# ---- Run the three programs sequentially ---------------------------------
+RUN1="$(run_iteration "$PROG1")"
+"$SCRIPT_DIR/verify-phase2.sh" "$INSTALL_TEST_REPO" "$PROG1" "$RUN1" "openevolve"
+
+RUN2="$(run_iteration "$PROG2")"
+"$SCRIPT_DIR/verify-phase2.sh" "$INSTALL_TEST_REPO" "$PROG2" "$RUN2" "test-driven"
+
+RUN3="$(run_iteration "$PROG3")"
+"$SCRIPT_DIR/verify-phase2.sh" "$INSTALL_TEST_REPO" "$PROG3" "$RUN3" "plain"
+
+# --------------------------------------------------------------------------
+# All assertions passed -- mark for the cleanup trap.
+# -------------------------------------------------------------------------- +hr +log "all phases passed" +PASS=1 diff --git a/tests/install-integration/teardown.sh b/tests/install-integration/teardown.sh new file mode 100755 index 0000000..88ac2af --- /dev/null +++ b/tests/install-integration/teardown.sh @@ -0,0 +1,99 @@ +#!/usr/bin/env bash +# teardown.sh - reset the integration-test target repo to a known-good base +# state. Idempotent: safe to re-run. +# +# Usage: +# teardown.sh +# +# What it does (all against the *remote* repo via gh / git push --force): +# 1. Force-reset main to . +# 2. Close all issues labeled `autoloop-program` (also the +# `[Autoloop: ...]` status issues). +# 3. Close all open PRs whose head branch starts with `autoloop/` or +# whose head branch is `install-autoloop`. +# 4. Delete all remote branches matching `autoloop/*`, +# `install-autoloop`, and `memory/autoloop`. +# +# Requires: gh authenticated with write access to ; git on PATH. +set -euo pipefail + +REPO="${1:?usage: teardown.sh }" +BASE_SHA="${2:?usage: teardown.sh }" + +log() { echo "TEARDOWN: $*"; } +warn() { echo "TEARDOWN WARN: $*" >&2; } + +# 1. Force-reset main to base-sha. Use a temp clone to avoid touching the +# caller's working dir. +TMP="$(mktemp -d -t autoloop-teardown-XXXXXX)" +trap 'rm -rf "$TMP"' EXIT + +log "cloning $REPO to reset main -> $BASE_SHA" +git clone --quiet "https://github.com/${REPO}.git" "$TMP/repo" +( + cd "$TMP/repo" + # Only force-push if main is not already at base sha. + CURRENT="$(git rev-parse origin/main)" + if [ "$CURRENT" != "$BASE_SHA" ]; then + log "main is at $CURRENT; resetting to $BASE_SHA" + git checkout --quiet -B main "$BASE_SHA" + git push --force --quiet origin main + else + log "main already at $BASE_SHA; no reset needed" + fi +) || warn "main reset failed (continuing)" + +# 2. Close `autoloop-program`-labelled issues. 
+log "closing autoloop-program issues" +mapfile -t ISSUES < <(gh issue list --repo "$REPO" --label autoloop-program \ + --state open --json number --jq '.[].number' 2>/dev/null || true) +for n in "${ISSUES[@]:-}"; do + [ -z "$n" ] && continue + gh issue close "$n" --repo "$REPO" --reason "not planned" \ + --comment "Closed by install-integration test teardown." \ + >/dev/null 2>&1 || warn "could not close issue #$n" + log "closed issue #$n" +done + +# Also catch `[Autoloop: ...]`-titled issues that lost the label or were +# auto-created without it (defensive). +mapfile -t TITLED < <(gh issue list --repo "$REPO" --state open --search '[Autoloop:' \ + --json number,title --jq '.[] | select(.title | startswith("[Autoloop:")) | .number' 2>/dev/null || true) +for n in "${TITLED[@]:-}"; do + [ -z "$n" ] && continue + gh issue close "$n" --repo "$REPO" --reason "not planned" \ + --comment "Closed by install-integration test teardown." \ + >/dev/null 2>&1 || warn "could not close titled issue #$n" + log "closed titled issue #$n" +done + +# 3. Close open PRs from autoloop/* and install-autoloop branches. +log "closing test PRs" +mapfile -t PRS < <(gh pr list --repo "$REPO" --state open \ + --json number,headRefName \ + --jq '.[] | select(.headRefName | startswith("autoloop/") or . == "install-autoloop") | .number' \ + 2>/dev/null || true) +for n in "${PRS[@]:-}"; do + [ -z "$n" ] && continue + gh pr close "$n" --repo "$REPO" --delete-branch \ + --comment "Closed by install-integration test teardown." \ + >/dev/null 2>&1 || warn "could not close PR #$n" + log "closed PR #$n" +done + +# 4. Delete any remaining branches we created (gh pr close --delete-branch +# handles most, but autoloop/* branches without a PR also exist). 
+log "deleting test branches" +mapfile -t BRANCHES < <(gh api "repos/${REPO}/branches" --paginate \ + --jq '.[].name' 2>/dev/null || true) +for b in "${BRANCHES[@]:-}"; do + case "$b" in + autoloop/*|install-autoloop|memory/autoloop) + gh api -X DELETE "repos/${REPO}/git/refs/heads/${b}" \ + >/dev/null 2>&1 || warn "could not delete branch $b" + log "deleted branch $b" + ;; + esac +done + +log "done" diff --git a/tests/install-integration/verify-phase1.sh b/tests/install-integration/verify-phase1.sh new file mode 100755 index 0000000..9acba83 --- /dev/null +++ b/tests/install-integration/verify-phase1.sh @@ -0,0 +1,82 @@ +#!/usr/bin/env bash +# verify-phase1.sh - assert install.md produced the expected files and that +# `gh aw compile autoloop` is idempotent. +# +# Usage: verify-phase1.sh +# +# The checkout must be on the install branch (post-`gh aw init`, +# post-copy-files, post-`gh aw compile autoloop`). All paths are relative +# to that checkout. +set -euo pipefail + +CHECKOUT="${1:?usage: verify-phase1.sh }" +cd "$CHECKOUT" + +fail() { echo "PHASE1 FAIL: $*" >&2; exit 1; } +ok() { echo "PHASE1 ok: $*"; } + +require_file() { + [ -f "$1" ] || fail "missing file: $1" + ok "file exists: $1" +} + +require_dir() { + [ -d "$1" ] || fail "missing directory: $1" + ok "dir exists: $1" +} + +# --- gh aw init artifacts ------------------------------------------------- +require_file ".gitattributes" + +# --- autoloop workflow files copied from this repo ------------------------ +require_file ".github/workflows/autoloop.md" +require_dir ".github/workflows/shared" + +# Issue #52: when sync-branches is removed, only autoloop.md should exist. +# Until then, sync-branches.md must also be present. Detect from the +# autoloop source repo (cloned in the test driver) and verify accordingly. 
+if [ "${EXPECT_SYNC_BRANCHES:-0}" = "1" ]; then
+  require_file ".github/workflows/sync-branches.md"
+  require_file ".github/workflows/sync-branches.lock.yml"
+fi
+
+# --- compiled lock file ---------------------------------------------------
+require_file ".github/workflows/autoloop.lock.yml"
+
+# --- issue template -------------------------------------------------------
+require_file ".github/ISSUE_TEMPLATE/autoloop-program.md"
+
+# --- programs directory present (may be empty) ----------------------------
+require_dir ".autoloop/programs"
+
+# --- lock idempotency: re-running compile must not change the lock file --
+LOCK=".github/workflows/autoloop.lock.yml"
+sha256() { shasum -a 256 "$1" | awk '{print $1}'; }
+SHA_BEFORE="$(sha256 "$LOCK")"
+ok "lock sha256 before: $SHA_BEFORE"
+
+# Re-run the compiler. If it fails or changes the lock, that's a phase-1
+# failure (install.md said this command is the way to compile, so it must
+# be idempotent).
+gh aw compile autoloop >/dev/null
+SHA_AFTER="$(sha256 "$LOCK")"
+ok "lock sha256 after: $SHA_AFTER"
+
+if [ "$SHA_BEFORE" != "$SHA_AFTER" ]; then
+  echo "--- diff ---" >&2
+  git --no-pager diff -- "$LOCK" >&2 || true
+  fail "gh aw compile is non-idempotent: lock file changed on second run"
+fi
+ok "lock file is idempotent"
+
+# --- install PR exists ----------------------------------------------------
+# The driver passes the PR URL via INSTALL_PR. Sanity-check that it has the
+# shape of a GitHub PR URL ending in a numeric PR id.
+if [ -n "${INSTALL_PR:-}" ]; then
+  if ! [[ "$INSTALL_PR" =~ ^https://github\.com/[^/]+/[^/]+/pull/[0-9]+$ ]]; then
+    fail "INSTALL_PR is not a well-formed PR URL: $INSTALL_PR"
+  fi
+  ok "install PR URL looks valid: $INSTALL_PR"
+fi
+
+echo "PHASE1 PASS"
diff --git a/tests/install-integration/verify-phase2.sh b/tests/install-integration/verify-phase2.sh
new file mode 100755
index 0000000..071609c
--- /dev/null
+++ b/tests/install-integration/verify-phase2.sh
@@ -0,0 +1,120 @@
+#!/usr/bin/env bash
+# verify-phase2.sh - per-program assertions for one Phase-2 program run.
+#
+# Usage:
+#   verify-phase2.sh <repo> <program> <run-id> <strategy>
+#
+#   <strategy> is one of: openevolve | test-driven | plain
+#
+# Asserts:
+#   1. Workflow run completed with conclusion `success` (the `agent` job exited
+#      success, regardless of accept/reject of the iteration).
+#   2. A program issue exists for <program>.
+#   3. The status comment (see the NOTE at step 3 below) is present on the issue.
+#   4. State file `<program>.md` exists on the `memory/autoloop` branch.
+#   5. Branch `autoloop/<program>` exists OR the iteration was rejected.
+#   6. Strategy-specific subsection in the state file:
+#        openevolve  -> contains `## 🧬 Population`
+#        test-driven -> contains `## ✅ Test Harness`
+#        plain       -> contains `## 📊 Iteration History`
+#                       and does NOT contain Population or Test Harness
+#                       (negative assertion).
+set -euo pipefail
+
+REPO="${1:?usage: verify-phase2.sh <repo> <program> <run-id> <strategy>}"
+PROGRAM="${2:?missing <program>}"
+RUN_ID="${3:?missing <run-id>}"
+STRATEGY="${4:?missing <strategy>}"
+
+fail() { echo "PHASE2 [$PROGRAM] FAIL: $*" >&2; exit 1; }
+ok() { echo "PHASE2 [$PROGRAM] ok: $*"; }
+
+# 1. Workflow run conclusion.
+CONCLUSION="$(gh run view "$RUN_ID" --repo "$REPO" --json conclusion --jq '.conclusion' 2>/dev/null || echo "")"
+if [ "$CONCLUSION" != "success" ]; then
+  fail "workflow run $RUN_ID conclusion=$CONCLUSION (want: success)"
+fi
+ok "workflow run $RUN_ID conclusion=success"
+
+# 2. Program issue exists. For issue-based programs the issue was created by
+# the test driver before the run; for file-based programs the iteration
+# auto-creates one. Either way, search by title (`|| true`: let the check below report failures).
+ISSUE_NUMBER="$(gh issue list --repo "$REPO" --state all --search "[Autoloop: $PROGRAM]" \
+  --json number,title \
+  --jq ".[] | select(.title == \"[Autoloop: $PROGRAM]\") | .number" \
+  2>/dev/null | head -1 || true)"
+if [ -z "$ISSUE_NUMBER" ]; then
+  fail "no program issue titled [Autoloop: $PROGRAM] found"
+fi
+ok "program issue: #$ISSUE_NUMBER"
+
+# 3. Status comment present. NOTE(review): the grep pattern is empty in this copy (marker looks stripped), so any line matches -- restore it.
+STATUS_HIT="$(gh issue view "$ISSUE_NUMBER" --repo "$REPO" --comments \
+  --json comments --jq '.comments[].body' 2>/dev/null \
+  | grep -c '' || true)"
+if [ "${STATUS_HIT:-0}" -lt 1 ]; then
+  fail "no comment on issue #$ISSUE_NUMBER"
+fi
+ok "status comment present on #$ISSUE_NUMBER"
+
+# 4. State file on memory/autoloop branch.
+STATE_FILE="${PROGRAM}.md"
+STATE_BODY="$(gh api "repos/${REPO}/contents/${STATE_FILE}?ref=memory/autoloop" \
+  --jq '.content' 2>/dev/null | base64 --decode 2>/dev/null || true)"
+if [ -z "$STATE_BODY" ]; then
+  fail "state file ${STATE_FILE} missing on memory/autoloop branch"
+fi
+ok "state file present: memory/autoloop:${STATE_FILE}"
+
+if ! echo "$STATE_BODY" | grep -q 'Machine State'; then
+  fail "state file has no 'Machine State' table"
+fi
+ok "state file has Machine State table"
+
+# 5. autoloop/<program> branch exists OR iteration was rejected.
+BRANCH_OK=0
+if gh api "repos/${REPO}/branches/autoloop/${PROGRAM}" >/dev/null 2>&1; then
+  BRANCH_OK=1
+  ok "branch autoloop/${PROGRAM} exists"
+else
+  # Acceptable only if the iteration was rejected. Any issue comment that
+  # contains "reject" (case-insensitive) counts as the rejection marker.
+  if gh issue view "$ISSUE_NUMBER" --repo "$REPO" --comments \
+    --json comments --jq '.comments[].body' 2>/dev/null \
+    | grep -qi 'reject'; then
+    ok "no autoloop branch, but iteration was rejected (acceptable)"
+    BRANCH_OK=1
+  fi
+fi
+[ "$BRANCH_OK" = "1" ] || fail "no autoloop/${PROGRAM} branch and no rejection marker"
+
+# 6. Strategy-specific subsection.
+case "$STRATEGY" in
+  openevolve)
+    echo "$STATE_BODY" | grep -q '## 🧬 Population' \
+      || fail "openevolve: state file missing '## 🧬 Population'"
+    ok "state file has '## 🧬 Population'"
+    ;;
+  test-driven)
+    echo "$STATE_BODY" | grep -q '## ✅ Test Harness' \
+      || fail "test-driven: state file missing '## ✅ Test Harness'"
+    ok "state file has '## ✅ Test Harness'"
+    ;;
+  plain)
+    echo "$STATE_BODY" | grep -q '## 📊 Iteration History' \
+      || fail "plain: state file missing '## 📊 Iteration History'"
+    ok "state file has '## 📊 Iteration History'"
+    if echo "$STATE_BODY" | grep -q '## 🧬 Population'; then
+      fail "plain: state file unexpectedly contains '## 🧬 Population' (strategy bleed)"
+    fi
+    if echo "$STATE_BODY" | grep -q '## ✅ Test Harness'; then
+      fail "plain: state file unexpectedly contains '## ✅ Test Harness' (strategy bleed)"
+    fi
+    ok "no strategy-section bleed into plain program"
+    ;;
+  *)
+    fail "unknown strategy: $STRATEGY"
+    ;;
+esac
+
+echo "PHASE2 [$PROGRAM] PASS"