From 06bbd9bf7eac93f52fe83d15ceb37c022deb8dd8 Mon Sep 17 00:00:00 2001 From: Daniel Meppiel Date: Mon, 27 Apr 2026 11:02:33 +0200 Subject: [PATCH] Add evals suite to pr-description-skill (genesis-driven) Adds a self-contained evals/ bundle and a non-interactive runner under .apm/skills/pr-description-skill/. Mirror regenerated via apm install --target copilot. - evals/evals.json: top-level manifest + ship gates - evals/triggers.json: 18 trigger items (9 fire / 9 no-fire), 60/40 train/val split - evals/content/*.json: 3 PR-shape scenarios (cross-cutting refactor, docs-only, mechanical dep bump) each with regex rubric and with_skill/without_skill fixtures - evals/fixtures/*.md: pre-recorded sample outputs for the two conditions; deterministic, no LLM API required at runtime - scripts/run_evals.py: stdlib-only runner; --filter, --split, --quiet, --no-write, --help; JSON on stdout, diagnostics on stderr, exit codes 0/1/2 - evals/results/.gitkeep + evals/.gitignore: track the dir, ignore generated result JSON - evals/README.md: how to run, scoring rules, gates, how to add evals, regeneration policy Genesis handoff packet persisted out-of-tree at the session state dir; design ends at step 6, this commit is step 7b output. 
Validation (val split): - triggers should-fire correctness = 1.0 (gate >= 0.5) - triggers should-not-fire correctness = 1.0 (gate >= 0.5) - content delta_anchors per scenario = 11, 6, 9 (gate >= 1) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../pr-description-skill/evals/.gitignore | 2 + .../pr-description-skill/evals/README.md | 132 +++++++ .../evals/content/auth-refactor.json | 43 +++ .../evals/content/dep-bump.json | 37 ++ .../evals/content/docs-only.json | 34 ++ .../pr-description-skill/evals/evals.json | 62 ++++ .../fixtures/auth-refactor__with_skill.md | 105 ++++++ .../fixtures/auth-refactor__without_skill.md | 32 ++ .../evals/fixtures/dep-bump__with_skill.md | 52 +++ .../evals/fixtures/dep-bump__without_skill.md | 13 + .../evals/fixtures/docs-only__with_skill.md | 63 ++++ .../fixtures/docs-only__without_skill.md | 14 + .../evals/results/.gitkeep | 0 .../pr-description-skill/evals/triggers.json | 131 +++++++ .../pr-description-skill/scripts/run_evals.py | 343 ++++++++++++++++++ .../pr-description-skill/evals/.gitignore | 2 + .../pr-description-skill/evals/README.md | 132 +++++++ .../evals/content/auth-refactor.json | 43 +++ .../evals/content/dep-bump.json | 37 ++ .../evals/content/docs-only.json | 34 ++ .../pr-description-skill/evals/evals.json | 62 ++++ .../fixtures/auth-refactor__with_skill.md | 105 ++++++ .../fixtures/auth-refactor__without_skill.md | 32 ++ .../evals/fixtures/dep-bump__with_skill.md | 52 +++ .../evals/fixtures/dep-bump__without_skill.md | 13 + .../evals/fixtures/docs-only__with_skill.md | 63 ++++ .../fixtures/docs-only__without_skill.md | 14 + .../evals/results/.gitkeep | 0 .../pr-description-skill/evals/triggers.json | 131 +++++++ .../pr-description-skill/scripts/run_evals.py | 343 ++++++++++++++++++ 30 files changed, 2126 insertions(+) create mode 100644 .apm/skills/pr-description-skill/evals/.gitignore create mode 100644 .apm/skills/pr-description-skill/evals/README.md create mode 100644 
.apm/skills/pr-description-skill/evals/content/auth-refactor.json create mode 100644 .apm/skills/pr-description-skill/evals/content/dep-bump.json create mode 100644 .apm/skills/pr-description-skill/evals/content/docs-only.json create mode 100644 .apm/skills/pr-description-skill/evals/evals.json create mode 100644 .apm/skills/pr-description-skill/evals/fixtures/auth-refactor__with_skill.md create mode 100644 .apm/skills/pr-description-skill/evals/fixtures/auth-refactor__without_skill.md create mode 100644 .apm/skills/pr-description-skill/evals/fixtures/dep-bump__with_skill.md create mode 100644 .apm/skills/pr-description-skill/evals/fixtures/dep-bump__without_skill.md create mode 100644 .apm/skills/pr-description-skill/evals/fixtures/docs-only__with_skill.md create mode 100644 .apm/skills/pr-description-skill/evals/fixtures/docs-only__without_skill.md create mode 100644 .apm/skills/pr-description-skill/evals/results/.gitkeep create mode 100644 .apm/skills/pr-description-skill/evals/triggers.json create mode 100644 .apm/skills/pr-description-skill/scripts/run_evals.py create mode 100644 .github/skills/pr-description-skill/evals/.gitignore create mode 100644 .github/skills/pr-description-skill/evals/README.md create mode 100644 .github/skills/pr-description-skill/evals/content/auth-refactor.json create mode 100644 .github/skills/pr-description-skill/evals/content/dep-bump.json create mode 100644 .github/skills/pr-description-skill/evals/content/docs-only.json create mode 100644 .github/skills/pr-description-skill/evals/evals.json create mode 100644 .github/skills/pr-description-skill/evals/fixtures/auth-refactor__with_skill.md create mode 100644 .github/skills/pr-description-skill/evals/fixtures/auth-refactor__without_skill.md create mode 100644 .github/skills/pr-description-skill/evals/fixtures/dep-bump__with_skill.md create mode 100644 .github/skills/pr-description-skill/evals/fixtures/dep-bump__without_skill.md create mode 100644 
.github/skills/pr-description-skill/evals/fixtures/docs-only__with_skill.md create mode 100644 .github/skills/pr-description-skill/evals/fixtures/docs-only__without_skill.md create mode 100644 .github/skills/pr-description-skill/evals/results/.gitkeep create mode 100644 .github/skills/pr-description-skill/evals/triggers.json create mode 100644 .github/skills/pr-description-skill/scripts/run_evals.py diff --git a/.apm/skills/pr-description-skill/evals/.gitignore b/.apm/skills/pr-description-skill/evals/.gitignore new file mode 100644 index 000000000..62279c73a --- /dev/null +++ b/.apm/skills/pr-description-skill/evals/.gitignore @@ -0,0 +1,2 @@ +# Ignore generated result files; track only the .gitkeep sentinel. +results/*.json diff --git a/.apm/skills/pr-description-skill/evals/README.md b/.apm/skills/pr-description-skill/evals/README.md new file mode 100644 index 000000000..5860c8d45 --- /dev/null +++ b/.apm/skills/pr-description-skill/evals/README.md @@ -0,0 +1,132 @@ +# pr-description-skill evals + +This bundle answers two questions deterministically and without +requiring an LLM API key: + +1. **TRIGGER EVALS**: does the SKILL.md `description:` reliably + match real should-fire intents and avoid near-miss queries? +2. **CONTENT EVALS**: does loading the SKILL.md body change the + shape of a PR description an agent produces, vs not loading + it? 
+ +## Layout + +``` +evals/ + evals.json # top-level manifest (gates, keyword lists) + triggers.json # 18 trigger items (9 fire / 9 no-fire), + # ~60/40 train/val split + content/ + auth-refactor.json # cross-cutting refactor scenario + rubric + docs-only.json # docs-only PR scenario + rubric + dep-bump.json # mechanical dep bump scenario + rubric + fixtures/ + <id>__with_skill.md # representative output produced + # under the SKILL.md guidance + <id>__without_skill.md # representative output produced + # without the skill loaded + results/ + .gitkeep # tracked sentinel + *.json # one file per run (gitignored) + README.md # this file +``` + +The runner script lives at +`.apm/skills/pr-description-skill/scripts/run_evals.py`. + +## Run + +From the repo root (or this skill's directory): + +``` +python .apm/skills/pr-description-skill/scripts/run_evals.py +``` + +Common options: + +| Flag | Effect | +|---|---| +| `--filter triggers` | Run only the trigger evals. | +| `--filter content` | Run only the content evals. | +| `--split train` | Score the train split for triggers (default is `val`, the ship gate). | +| `--split all` | Score both splits. | +| `--no-write` | Do not write to `evals/results/`. | +| `--quiet` | Suppress stderr diagnostics. | + +Exit codes: + +* `0` = all gates met +* `1` = one or more gates failed +* `2` = runner error (missing manifest, parse error, missing fixture) + +## Trigger eval scoring + +The runner uses a deterministic dispatcher approximation defined in +`evals.json`: + +1. If any phrase from `stop_list` appears verbatim in the query + (lowercased), predict `no_fire`. +2. Otherwise if any phrase from `trigger_keywords_primary` appears + verbatim, predict `fire`. +3. Otherwise count distinct tokens from + `trigger_keywords_secondary`; predict `fire` iff at least 3 + distinct tokens AND one of `{pr, pull}` is present. + +This is NOT a perfect proxy for an actual LLM dispatcher. It is +fast, deterministic, and CI-friendly. 
When `gh models` becomes +available, an `--llm` flag can be added that calls the real +dispatcher; the manifest schema already accommodates it. + +Ship gate (validation split): + +* should-fire correctness >= 0.5 +* should-not-fire correctness >= 0.5 (i.e. less than 50% of + near-miss queries leak through as `fire`) + +## Content eval scoring + +Each scenario ships two fixtures: one representing the agent's +output WITH the skill loaded, one representing the same task +WITHOUT it. The same regex rubric scores both. The reported +`delta_anchors` is the count of rubric anchors that fire on +`with_skill` but not on `without_skill`. + +Ship gate: every scenario has `delta_anchors >= 1`. A scenario +with zero delta is a signal that the skill is not adding +measurable value on that shape; the genesis discipline says +redesign or delete. (This runner only flags; deletion is a +human decision.) + +### Fixtures and the LLM-in-the-loop question + +The fixtures are pre-recorded representative outputs. They are +NOT regenerated by an LLM at run time. Two reasons: + +1. **Determinism**: re-running the suite must produce identical + results so regressions are visible. +2. **No required keys**: contributors should be able to run the + suite with only Python stdlib. + +When a maintainer materially changes SKILL.md, the fixtures +SHOULD be regenerated by hand (or by running the agent once on +each scenario, with and without the skill loaded, and pasting +the outputs in). Document the regeneration in the PR that +changes SKILL.md. + +## Adding a new eval + +* **New trigger eval**: append an entry to `triggers.json` with + a fresh `id`, the `query`, the `expected` outcome, and the + `split`. Keep the 60/40 ratio roughly intact. +* **New content scenario**: create `content/<id>.json` (mirror + one of the existing files), add `fixtures/<id>__with_skill.md` + and `fixtures/<id>__without_skill.md`, and append the path to + `content_manifests` in `evals.json`. 
+ +## Encoding + +All eval source files (Python, JSON, README) stay within +printable ASCII per the repo-wide encoding rule. Fixtures are +markdown intended to model PR-body output and MAY contain +Unicode, mirroring the SKILL.md "Output charset rule" -- but the +provided fixtures here remain ASCII for portability. diff --git a/.apm/skills/pr-description-skill/evals/content/auth-refactor.json b/.apm/skills/pr-description-skill/evals/content/auth-refactor.json new file mode 100644 index 000000000..3e4ae270d --- /dev/null +++ b/.apm/skills/pr-description-skill/evals/content/auth-refactor.json @@ -0,0 +1,43 @@ +{ + "schema_version": 1, + "id": "auth-refactor", + "summary": "Cross-cutting auth refactor: token resolution moves into a single AuthResolver; multiple call sites updated; one breaking change to a public CLI flag. PR is non-trivial and exercises every required section.", + "scenario": { + "branch": "refactor/auth-resolver", + "base": "main", + "files_changed": [ + "M src/apm_cli/auth/__init__.py", + "A src/apm_cli/auth/resolver.py", + "M src/apm_cli/cli.py", + "M src/apm_cli/integration/git.py", + "M tests/unit/auth/test_resolver.py", + "M CHANGELOG.md" + ], + "commits": [ + "feat(auth): introduce AuthResolver as single source of token resolution", + "refactor(cli): replace per-host PAT lookup with AuthResolver.get(host)", + "fix(integration/git): drop legacy GITHUB_APM_PAT fallback path", + "docs(changelog): note breaking removal of --token-source flag" + ], + "linked_issue": "#812 -- token resolution scattered across modules; intermittent EMU lookups failing", + "validation_evidence": "uv run pytest tests/unit/auth -x -> 142 passed in 3.4s\napm audit --ci -> 0 findings" + }, + "rubric": [ + {"id": "tldr-present", "pattern": "(?im)^\\s*##?\\s*TL;DR\\b", "weight": 1, "description": "TL;DR section header present"}, + {"id": "tldr-short", "pattern": "(?is)##?\\s*TL;DR.{1,800}?\\n##?\\s", "weight": 1, "description": "TL;DR is short (under 800 chars before next 
H2)"}, + {"id": "problem-section", "pattern": "(?im)^\\s*##?\\s*Problem\\b", "weight": 1, "description": "Problem (WHY) section present"}, + {"id": "anchored-quote", "pattern": "\\[\"[^\"]{8,}\"\\]\\(https?://", "weight": 2, "description": "At least one verbatim quote anchored to a URL"}, + {"id": "approach-or-implementation", "pattern": "(?im)^\\s*##?\\s*(Approach|Implementation)\\b", "weight": 1, "description": "Approach or Implementation section present"}, + {"id": "mermaid-block", "pattern": "(?s)```mermaid[\\s\\S]+?```", "weight": 2, "description": "At least one mermaid diagram block"}, + {"id": "tradeoffs-section", "pattern": "(?im)^\\s*##?\\s*Trade-?offs?\\b", "weight": 1, "description": "Trade-offs section present"}, + {"id": "validation-section", "pattern": "(?im)^\\s*##?\\s*Validation\\b", "weight": 1, "description": "Validation section present"}, + {"id": "validation-real-output", "pattern": "(?im)pytest|apm audit|uv run", "weight": 1, "description": "Validation shows real command output"}, + {"id": "how-to-test", "pattern": "(?im)^\\s*##?\\s*How to test\\b", "weight": 1, "description": "How to test section present"}, + {"id": "trailer", "pattern": "Co-authored-by: Copilot <223556219", "weight": 1, "description": "Copilot co-author trailer present"}, + {"id": "no-marketing-tone", "pattern": "(?i)significantly enhances|best-in-class|game-changing|revolutionary", "weight": -2, "description": "PENALTY: marketing tone detected"} + ], + "fixtures": { + "with_skill": "../fixtures/auth-refactor__with_skill.md", + "without_skill": "../fixtures/auth-refactor__without_skill.md" + } +} diff --git a/.apm/skills/pr-description-skill/evals/content/dep-bump.json b/.apm/skills/pr-description-skill/evals/content/dep-bump.json new file mode 100644 index 000000000..1deee05a1 --- /dev/null +++ b/.apm/skills/pr-description-skill/evals/content/dep-bump.json @@ -0,0 +1,37 @@ +{ + "schema_version": 1, + "id": "dep-bump", + "summary": "Mechanical dependency upgrade: bump click 
from 8.1.7 to 8.2.1; lockfile-only changes plus a 1-line type annotation tweak. Tests Trade-offs may be 1-2 bullets and mermaid is optional.", + "scenario": { + "branch": "deps/click-8.2.1", + "base": "main", + "files_changed": [ + "M pyproject.toml", + "M uv.lock", + "M src/apm_cli/cli.py" + ], + "commits": [ + "chore(deps): bump click 8.1.7 -> 8.2.1", + "chore(cli): adjust type hint for new click.Context generic" + ], + "linked_issue": "(none)", + "validation_evidence": "uv sync --extra dev -> resolved in 2.1s\nuv run pytest tests/unit -x -> 2418 passed in 38.7s" + }, + "rubric": [ + {"id": "tldr-present", "pattern": "(?im)^\\s*##?\\s*TL;DR\\b", "weight": 1, "description": "TL;DR present"}, + {"id": "tldr-short", "pattern": "(?is)##?\\s*TL;DR.{1,500}?\\n##?\\s", "weight": 1, "description": "TL;DR very short on a mechanical PR"}, + {"id": "problem-section", "pattern": "(?im)^\\s*##?\\s*Problem\\b", "weight": 1, "description": "Problem section present"}, + {"id": "implementation-section", "pattern": "(?im)^\\s*##?\\s*Implementation\\b", "weight": 1, "description": "Implementation section present"}, + {"id": "tradeoffs-section", "pattern": "(?im)^\\s*##?\\s*Trade-?offs?\\b", "weight": 1, "description": "Trade-offs section present (may be 1-2 bullets)"}, + {"id": "validation-section", "pattern": "(?im)^\\s*##?\\s*Validation\\b", "weight": 1, "description": "Validation section present"}, + {"id": "validation-real-output", "pattern": "(?im)pytest|uv sync|2418 passed", "weight": 1, "description": "Real validation output (pytest counts) shown"}, + {"id": "how-to-test", "pattern": "(?im)^\\s*##?\\s*How to test\\b", "weight": 1, "description": "How to test section present"}, + {"id": "trailer", "pattern": "Co-authored-by: Copilot <223556219", "weight": 1, "description": "Copilot co-author trailer present"}, + {"id": "no-marketing-tone", "pattern": "(?i)significantly enhances|best-in-class|game-changing", "weight": -2, "description": "PENALTY: marketing tone detected"}, + 
{"id": "no-diff-restate", "pattern": "(?im)^[+-]{3} ", "weight": -1, "description": "PENALTY: diff lines restated in body"} + ], + "fixtures": { + "with_skill": "../fixtures/dep-bump__with_skill.md", + "without_skill": "../fixtures/dep-bump__without_skill.md" + } +} diff --git a/.apm/skills/pr-description-skill/evals/content/docs-only.json b/.apm/skills/pr-description-skill/evals/content/docs-only.json new file mode 100644 index 000000000..c3bc93c08 --- /dev/null +++ b/.apm/skills/pr-description-skill/evals/content/docs-only.json @@ -0,0 +1,34 @@ +{ + "schema_version": 1, + "id": "docs-only", + "summary": "Markdown-only PR adding a quickstart guide. The skill must keep TL;DR, Problem, Validation, How-to-test sections even though the PR is 'trivial' (per SKILL.md gotcha).", + "scenario": { + "branch": "docs/quickstart", + "base": "main", + "files_changed": [ + "A docs/src/content/docs/guides/quickstart.md", + "M docs/src/content/docs/index.md" + ], + "commits": [ + "docs(quickstart): add 5-minute getting-started guide", + "docs(index): link quickstart from landing page" + ], + "linked_issue": "#640 -- new contributors bounce after install; need a 5-minute happy path", + "validation_evidence": "npm run --prefix docs build -> built in 4.2s, no broken links\napm audit --ci -> 0 findings" + }, + "rubric": [ + {"id": "tldr-present", "pattern": "(?im)^\\s*##?\\s*TL;DR\\b", "weight": 1, "description": "TL;DR present even on docs-only PR"}, + {"id": "problem-section", "pattern": "(?im)^\\s*##?\\s*Problem\\b", "weight": 1, "description": "Problem section present even on docs-only PR (per SKILL.md gotcha)"}, + {"id": "anchored-quote", "pattern": "\\[\"[^\"]{8,}\"\\]\\(https?://", "weight": 2, "description": "At least one verbatim anchored quote"}, + {"id": "validation-section", "pattern": "(?im)^\\s*##?\\s*Validation\\b", "weight": 1, "description": "Validation section present"}, + {"id": "validation-build", "pattern": "(?im)npm run|build|broken link", "weight": 1, 
"description": "Real validation evidence (build/link check) shown"}, + {"id": "how-to-test", "pattern": "(?im)^\\s*##?\\s*How to test\\b", "weight": 1, "description": "How to test present"}, + {"id": "trailer", "pattern": "Co-authored-by: Copilot <223556219", "weight": 1, "description": "Copilot co-author trailer present"}, + {"id": "no-trivial-skip", "pattern": "(?i)the PR is trivial|too small to need", "weight": -2, "description": "PENALTY: skipping sections because PR is small"}, + {"id": "no-marketing-tone", "pattern": "(?i)significantly enhances|best-in-class|game-changing", "weight": -2, "description": "PENALTY: marketing tone detected"} + ], + "fixtures": { + "with_skill": "../fixtures/docs-only__with_skill.md", + "without_skill": "../fixtures/docs-only__without_skill.md" + } +} diff --git a/.apm/skills/pr-description-skill/evals/evals.json b/.apm/skills/pr-description-skill/evals/evals.json new file mode 100644 index 000000000..f7a8750ea --- /dev/null +++ b/.apm/skills/pr-description-skill/evals/evals.json @@ -0,0 +1,62 @@ +{ + "schema_version": 1, + "skill": "pr-description-skill", + "skill_path": "../SKILL.md", + "triggers_manifest": "triggers.json", + "content_manifests": [ + "content/auth-refactor.json", + "content/docs-only.json", + "content/dep-bump.json" + ], + "gates": { + "triggers": { + "split": "val", + "should_fire_rate_min": 0.5, + "should_not_fire_rate_max": 0.5 + }, + "content": { + "delta_min_anchors": 1, + "scope": "every_scenario" + } + }, + "stop_list": [ + "commit message", + "release note", + "release notes", + "changelog", + "open an issue", + "open issue", + "review this pr", + "code review comment", + "design doc", + "summarize the diff" + ], + "trigger_keywords_primary": [ + "pr description", + "pr body", + "pull request description", + "pull request body", + "pr template", + "pr write-up", + "pr writeup", + "draft the pr", + "draft a pr", + "open the pr", + "open a pr", + "summarize this branch as a pr", + "fill in the pr template" 
+ ], + "trigger_keywords_secondary": [ + "pr", + "pull", + "request", + "description", + "body", + "draft", + "open", + "write", + "fill", + "summarize", + "branch" + ] +} diff --git a/.apm/skills/pr-description-skill/evals/fixtures/auth-refactor__with_skill.md b/.apm/skills/pr-description-skill/evals/fixtures/auth-refactor__with_skill.md new file mode 100644 index 000000000..0078a04a9 --- /dev/null +++ b/.apm/skills/pr-description-skill/evals/fixtures/auth-refactor__with_skill.md @@ -0,0 +1,105 @@ +refactor(auth): centralize token resolution behind AuthResolver + +## TL;DR + +Token resolution was scattered across `cli.py`, `integration/git.py`, +and ad-hoc `os.environ` lookups, producing intermittent EMU failures +and a `--token-source` flag whose semantics no two callers agreed +on. This PR introduces a single `AuthResolver` and removes the +flag. One breaking change (flag removal) is documented in the +CHANGELOG. + +## Problem (WHY) + +- Three modules each implemented their own host-to-token mapping, + diverging on EMU host suffixes (`#812`). +- The `--token-source` CLI flag let callers override resolution + per-invocation but was respected only by `cli.py`, silently + ignored by the git integration layer. +- This produced support reports of "auth works in `apm install` + but fails in `apm compile`" for the same machine. + +The skill's PROSE foundation calls this out directly: +["Grounding outputs in deterministic tool execution transforms probabilistic generation into verifiable action."](https://danielmeppiel.github.io/awesome-ai-native/docs/prose/) +-- token resolution that branches by call site is the opposite of +deterministic. + +## Approach (WHAT) + +A single `AuthResolver` owns the host-to-token mapping. Callers +ask `AuthResolver.get(host)` and get a token or a typed error. +The `--token-source` flag is removed; the resolver consults a +fixed precedence (env -> keyring -> gh auth) that is identical +for every caller. 
+ +## Implementation (HOW) + +| File | Change | +|---|---| +| `src/apm_cli/auth/resolver.py` | NEW. `AuthResolver` class with `get(host)`, fixed precedence chain. | +| `src/apm_cli/auth/__init__.py` | Re-export `AuthResolver`. | +| `src/apm_cli/cli.py` | Drop `--token-source`; route all auth lookups through `AuthResolver`. | +| `src/apm_cli/integration/git.py` | Drop legacy `GITHUB_APM_PAT` fallback; delegate to resolver. | +| `tests/unit/auth/test_resolver.py` | New unit tests for resolver precedence and EMU host parsing. | +| `CHANGELOG.md` | Note `--token-source` removal under Breaking Changes. | + +## Diagram + +The resolver becomes the only edge from callers to the token +sources; the legacy fan-out is collapsed. + +```mermaid +flowchart LR + CLI["apm cli command"] --> R["AuthResolver"] + GIT["git integration"] --> R + R --> ENV["environment variables"] + R --> KR["OS keyring"] + R --> GH["gh auth status"] +``` + +## Trade-offs + +- **Removed `--token-source`**: minor break for power users who + scripted around the flag; mitigated by the env-var path which + covers every documented use case. +- **Single precedence chain**: any future per-host policy (e.g. + ADO_APM_PAT for Azure DevOps) plugs in as a new resolver + source, not a new caller-side branch. +- **Rejected**: keeping the flag and routing it through the + resolver. That preserves the bug surface (callers still + passing different flag values) without adding value. + +## Benefits + +1. One source of truth for auth (closes #812). +2. EMU host suffixes parsed in one place. +3. Test surface shrinks from 3 modules to 1. +4. Removes 84 lines of duplicated env-var lookup code. + +## Validation + +
Full test output + +``` +$ uv run pytest tests/unit/auth -x +============================= test session starts ============================= +collected 142 items +tests/unit/auth/test_resolver.py ............................................ +142 passed in 3.4s + +$ apm audit --ci +0 findings +``` + +
+ +## How to test + +- [ ] `uv sync --extra dev` +- [ ] `uv run pytest tests/unit/auth -x` (expect 142 passing) +- [ ] `apm install microsoft/apm/packages/apm-guide` against an + EMU host; confirm the install completes without prompting. +- [ ] Try `apm install --token-source=env`; expect a clear + "unknown option" error (flag is gone). + +Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> diff --git a/.apm/skills/pr-description-skill/evals/fixtures/auth-refactor__without_skill.md b/.apm/skills/pr-description-skill/evals/fixtures/auth-refactor__without_skill.md new file mode 100644 index 000000000..10b6d2119 --- /dev/null +++ b/.apm/skills/pr-description-skill/evals/fixtures/auth-refactor__without_skill.md @@ -0,0 +1,32 @@ +# Auth refactor + +This PR refactors authentication. It centralizes token resolution +into a new AuthResolver class and removes the --token-source flag. + +## What changed + +- Added a new file src/apm_cli/auth/resolver.py with the + AuthResolver class. +- Updated src/apm_cli/auth/__init__.py to export AuthResolver. +- Updated src/apm_cli/cli.py to use AuthResolver for all token + lookups and removed the --token-source flag. +- Updated src/apm_cli/integration/git.py to delegate to + AuthResolver and dropped the GITHUB_APM_PAT fallback. +- Added unit tests in tests/unit/auth/test_resolver.py. +- Updated CHANGELOG.md. + +## Why + +The previous code had token resolution logic scattered across +multiple modules. This was fragile and led to bugs. Centralizing +it makes the code cleaner and easier to maintain. This is a +significantly enhanced approach to authentication. + +## Testing + +I ran the tests and they passed. The audit also passed. + +## Notes + +This is a breaking change because the --token-source flag is +removed. 
diff --git a/.apm/skills/pr-description-skill/evals/fixtures/dep-bump__with_skill.md b/.apm/skills/pr-description-skill/evals/fixtures/dep-bump__with_skill.md new file mode 100644 index 000000000..06d48f629 --- /dev/null +++ b/.apm/skills/pr-description-skill/evals/fixtures/dep-bump__with_skill.md @@ -0,0 +1,52 @@ +chore(deps): bump click 8.1.7 -> 8.2.1 + +## TL;DR + +Routine click upgrade. Picks up the new `Context` generic typing +and a fix for boolean-flag short forms. One type hint adjusted in +`cli.py`. Lockfile regenerated; full unit suite green. + +## Problem (WHY) + +- click 8.2.x ships a typing fix flagged by mypy on Python 3.12. +- Staying on 8.1.7 means our type checks fail under the supported + Python matrix. + +The skill's anchor for cite-or-omit applies here too: +["Add what the agent lacks, omit what it knows"](https://agentskills.io/skill-creation/best-practices) +-- this PR body says only what the diff alone cannot. + +## Approach (WHAT) + +Mechanical. Bump version pin, regenerate lockfile, adjust one +type hint. + +## Implementation (HOW) + +| File | Change | +|---|---| +| `pyproject.toml` | `click = "^8.2.1"` (was `^8.1.7`). | +| `uv.lock` | Regenerated by `uv sync`. | +| `src/apm_cli/cli.py` | One type hint updated for new `click.Context` generic. | + +## Trade-offs + +- **Direct bump vs caret-locking to 8.1**: chose direct bump + because the typing fix is the reason for the upgrade. 
+ +## Validation + +``` +$ uv sync --extra dev +resolved in 2.1s +$ uv run pytest tests/unit -x +2418 passed in 38.7s +``` + +## How to test + +- [ ] `uv sync --extra dev` +- [ ] `uv run pytest tests/unit -x` (expect 2418 passing) +- [ ] `uv run mypy src/apm_cli` (expect 0 errors) + +Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> diff --git a/.apm/skills/pr-description-skill/evals/fixtures/dep-bump__without_skill.md b/.apm/skills/pr-description-skill/evals/fixtures/dep-bump__without_skill.md new file mode 100644 index 000000000..21ccd2594 --- /dev/null +++ b/.apm/skills/pr-description-skill/evals/fixtures/dep-bump__without_skill.md @@ -0,0 +1,13 @@ +# Bump click + +This bumps click from 8.1.7 to 8.2.1. + +I also adjusted a type hint in cli.py because click.Context is +now generic. + +The lockfile was regenerated. + +This is a game-changing dependency upgrade for our type-checking +story. + +Tests pass. diff --git a/.apm/skills/pr-description-skill/evals/fixtures/docs-only__with_skill.md b/.apm/skills/pr-description-skill/evals/fixtures/docs-only__with_skill.md new file mode 100644 index 000000000..382f5470b --- /dev/null +++ b/.apm/skills/pr-description-skill/evals/fixtures/docs-only__with_skill.md @@ -0,0 +1,63 @@ +docs(quickstart): add 5-minute getting-started guide + +## TL;DR + +New contributors land on the docs index, scroll, and bounce. +This PR adds a 5-minute quickstart linked from the landing page +so first-touch users have a happy path before reading reference +material. + +## Problem (WHY) + +- Issue #640 reports a 60% bounce rate on `/docs/` for new + visitors. +- The current landing page lists every guide in alphabetical + order; "Quickstart" did not exist. +- Agent Skills makes the diagnosis explicit: + ["agents pattern-match well against concrete structures"](https://agentskills.io/skill-creation/best-practices) + -- and the same is true for human readers landing cold. + +## Approach (WHAT) + +Additive: see Implementation. 
One new page, one link added on +the index. + +## Implementation (HOW) + +| File | Change | +|---|---| +| `docs/src/content/docs/guides/quickstart.md` | NEW. 5-minute happy path: install, init, run. | +| `docs/src/content/docs/index.md` | Promote quickstart link above the alphabetical guide list. | + +## Trade-offs + +- **Promoting one guide above the list**: minor visual + asymmetry; pays for itself if bounce rate drops. +- **Rejected**: rewriting the landing page. Higher risk, larger + diff, harder to A/B. + +## Benefits + +1. Closes #640 if bounce rate drops measurably. +2. New contributor first-touch falls under 5 minutes. +3. Zero code change; rollback is `git revert`. + +## Validation + +``` +$ npm run --prefix docs build +built in 4.2s +no broken links +$ apm audit --ci +0 findings +``` + +## How to test + +- [ ] `npm run --prefix docs dev` +- [ ] Open `http://localhost:4321/docs/`; confirm Quickstart + link is above the fold. +- [ ] Click through; confirm the 5 steps complete on a fresh + clone in under 5 minutes. + +Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> diff --git a/.apm/skills/pr-description-skill/evals/fixtures/docs-only__without_skill.md b/.apm/skills/pr-description-skill/evals/fixtures/docs-only__without_skill.md new file mode 100644 index 000000000..5dddbc2fc --- /dev/null +++ b/.apm/skills/pr-description-skill/evals/fixtures/docs-only__without_skill.md @@ -0,0 +1,14 @@ +# Add quickstart guide + +Adds a quickstart guide to the docs. + +## Changes + +- Added docs/src/content/docs/guides/quickstart.md +- Updated docs/src/content/docs/index.md to link to it + +The PR is trivial -- just docs. No code changes. + +## Build + +The docs build passes. 
diff --git a/.apm/skills/pr-description-skill/evals/results/.gitkeep b/.apm/skills/pr-description-skill/evals/results/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.apm/skills/pr-description-skill/evals/triggers.json b/.apm/skills/pr-description-skill/evals/triggers.json new file mode 100644 index 000000000..c5baa891c --- /dev/null +++ b/.apm/skills/pr-description-skill/evals/triggers.json @@ -0,0 +1,131 @@ +{ + "schema_version": 1, + "items": [ + { + "id": "fire-01", + "query": "write a PR description for me", + "expected": "fire", + "split": "train", + "notes": "Verbatim phrase from SKILL.md description." + }, + { + "id": "fire-02", + "query": "draft the PR body", + "expected": "fire", + "split": "train", + "notes": "Verbatim phrase from SKILL.md description." + }, + { + "id": "fire-03", + "query": "open this PR -- can you fill in the template?", + "expected": "fire", + "split": "train", + "notes": "Combined open-PR + fill-template trigger." + }, + { + "id": "fire-04", + "query": "create the PR write-up please", + "expected": "fire", + "split": "train", + "notes": "Verbatim phrase from SKILL.md trigger list." + }, + { + "id": "fire-05", + "query": "summarize this branch as a PR", + "expected": "fire", + "split": "train", + "notes": "Verbatim phrase from SKILL.md trigger list." + }, + { + "id": "fire-06", + "query": "let's open the PR -- write the body", + "expected": "fire", + "split": "val", + "notes": "Open-PR + body, near-domain paraphrase." + }, + { + "id": "fire-07", + "query": "draft a pull request description for this branch", + "expected": "fire", + "split": "val", + "notes": "Long form of 'PR description'." + }, + { + "id": "fire-08", + "query": "pls write the pull request body", + "expected": "fire", + "split": "val", + "notes": "Casual tone, long form." 
+ }, + { + "id": "fire-09", + "query": "fill in the PR template before we ship", + "expected": "fire", + "split": "val", + "notes": "Verbatim phrase wrapped in a longer sentence." + }, + { + "id": "no-fire-01", + "query": "write a commit message for these changes", + "expected": "no_fire", + "split": "train", + "notes": "Adjacent intent (commit, not PR) -- common confusion." + }, + { + "id": "no-fire-02", + "query": "summarize this branch as a release note", + "expected": "no_fire", + "split": "train", + "notes": "Adjacent intent (release notes, not PR body)." + }, + { + "id": "no-fire-03", + "query": "draft a CHANGELOG entry for this work", + "expected": "no_fire", + "split": "train", + "notes": "Changelog work, not PR description." + }, + { + "id": "no-fire-04", + "query": "open an issue describing this bug", + "expected": "no_fire", + "split": "train", + "notes": "Issue, not PR." + }, + { + "id": "no-fire-05", + "query": "review this PR for me", + "expected": "no_fire", + "split": "train", + "notes": "Reviewing != describing." + }, + { + "id": "no-fire-06", + "query": "summarize the diff for me", + "expected": "no_fire", + "split": "val", + "notes": "Diff summary is an INPUT to PR description skill, not the same task." + }, + { + "id": "no-fire-07", + "query": "draft a code review comment", + "expected": "no_fire", + "split": "val", + "notes": "Review comment, not PR body." + }, + { + "id": "no-fire-08", + "query": "write the design doc for this branch", + "expected": "no_fire", + "split": "val", + "notes": "Design doc lives upstream of PR; different artifact." + }, + { + "id": "no-fire-09", + "query": "give me release notes for this milestone", + "expected": "no_fire", + "split": "val", + "notes": "Release notes, not PR body." 
+ } + ] +} diff --git a/.apm/skills/pr-description-skill/scripts/run_evals.py b/.apm/skills/pr-description-skill/scripts/run_evals.py new file mode 100644 index 000000000..c5056aaf1 --- /dev/null +++ b/.apm/skills/pr-description-skill/scripts/run_evals.py @@ -0,0 +1,343 @@ +#!/usr/bin/env python3 +# ASCII-only. Runs the pr-description-skill evals suite. +"""Run pr-description-skill evals. + +Two eval families are exercised: + + * TRIGGER EVALS scoring the SKILL.md `description:` against + should-fire and should-not-fire queries via a deterministic + keyword/bigram matcher. The matcher is documented in the + README; it approximates dispatcher behavior without requiring + a live LLM. + + * CONTENT EVALS scoring pre-recorded `with_skill` and + `without_skill` fixtures against per-scenario regex rubrics. + The delta between the two scores is reported per scenario. + +The runner is non-interactive, stdlib-only, and emits structured +JSON on stdout (machine-readable summary) plus diagnostics on +stderr. Exit codes: + + 0 = all gates met + 1 = one or more gates failed + 2 = runner error (manifest or fixture missing, parse error) + +Run from the worktree root: + + python .apm/skills/pr-description-skill/scripts/run_evals.py + +Use --help for full options. 
+""" +from __future__ import annotations + +import argparse +import datetime as _dt +import json +import re +import sys +from pathlib import Path +from typing import Any + +SCHEMA_VERSION = 1 +SKILL_DIR = Path(__file__).resolve().parent.parent +EVALS_DIR = SKILL_DIR / "evals" + + +def _log(msg: str) -> None: + print(msg, file=sys.stderr) + + +def _load_json(path: Path) -> dict[str, Any]: + try: + return json.loads(path.read_text(encoding="utf-8")) + except FileNotFoundError: + _log(f"[x] missing file: {path}") + sys.exit(2) + except json.JSONDecodeError as exc: + _log(f"[x] invalid JSON in {path}: {exc}") + sys.exit(2) + + +# --------------------------------------------------------------------------- +# trigger evals +# --------------------------------------------------------------------------- + +def _normalize(text: str) -> str: + return re.sub(r"\s+", " ", text.lower()).strip() + + +def score_trigger(query: str, manifest: dict[str, Any]) -> tuple[bool, dict[str, Any]]: + """Return (predicted_fire, diagnostic) for one query. + + Rule (deterministic dispatcher approximation): + 1. Lowercase + collapse whitespace. + 2. If any phrase from `stop_list` appears verbatim, predict + no_fire (negative override beats everything else). + 3. If any phrase from `trigger_keywords_primary` appears + verbatim, predict fire. + 4. Otherwise count distinct `trigger_keywords_secondary` + tokens present; predict fire iff the count is >= 3 AND + at least one of {pr, pull} is present. 
+ """ + q = _normalize(query) + + for stop in manifest["stop_list"]: + if stop in q: + return False, {"reason": "stop_list_hit", "match": stop} + + for kw in manifest["trigger_keywords_primary"]: + if kw in q: + return True, {"reason": "primary_match", "match": kw} + + sec = manifest["trigger_keywords_secondary"] + hits = sorted({k for k in sec if re.search(rf"\b{re.escape(k)}\b", q)}) + has_pr = any(t in hits for t in ("pr", "pull")) + if has_pr and len(hits) >= 3: + return True, {"reason": "secondary_threshold", "hits": hits} + + return False, {"reason": "no_match", "hits": hits} + + +def run_trigger_evals( + manifest: dict[str, Any], split: str +) -> dict[str, Any]: + triggers_path = EVALS_DIR / manifest["triggers_manifest"] + triggers = _load_json(triggers_path)["items"] + + if split != "all": + triggers = [t for t in triggers if t["split"] == split] + + rows: list[dict[str, Any]] = [] + fire_total = fire_correct = 0 + no_fire_total = no_fire_correct = 0 + + for item in triggers: + predicted_fire, diag = score_trigger(item["query"], manifest) + expected_fire = item["expected"] == "fire" + passed = predicted_fire == expected_fire + + rows.append({ + "id": item["id"], + "split": item["split"], + "query": item["query"], + "expected": item["expected"], + "predicted": "fire" if predicted_fire else "no_fire", + "passed": passed, + "diagnostic": diag, + }) + + if expected_fire: + fire_total += 1 + if passed: + fire_correct += 1 + else: + no_fire_total += 1 + if passed: + no_fire_correct += 1 + + fire_rate = (fire_correct / fire_total) if fire_total else 0.0 + no_fire_rate = (no_fire_correct / no_fire_total) if no_fire_total else 0.0 + + gates = manifest["gates"]["triggers"] + fire_gate_met = fire_rate >= gates["should_fire_rate_min"] + no_fire_gate_met = ( + (no_fire_rate) >= (1.0 - gates["should_not_fire_rate_max"]) + ) + + return { + "split": split, + "should_fire_correct_rate": fire_rate, + "should_not_fire_correct_rate": no_fire_rate, + "should_fire_gate": { + 
"min": gates["should_fire_rate_min"], + "met": fire_gate_met, + }, + "should_not_fire_gate": { + "max_miss_rate": gates["should_not_fire_rate_max"], + "met": no_fire_gate_met, + }, + "rows": rows, + "passed": fire_gate_met and no_fire_gate_met, + } + + +# --------------------------------------------------------------------------- +# content evals +# --------------------------------------------------------------------------- + +def score_fixture(text: str, rubric: list[dict[str, Any]]) -> dict[str, Any]: + score = 0 + anchors_hit: list[str] = [] + anchors_missed: list[str] = [] + for check in rubric: + pattern = re.compile(check["pattern"]) + match = bool(pattern.search(text)) + weight = int(check["weight"]) + if match: + score += weight + anchors_hit.append(check["id"]) + else: + if weight < 0: + # penalty pattern: not matching is GOOD, no score change + pass + else: + anchors_missed.append(check["id"]) + return {"score": score, "anchors_hit": anchors_hit, "anchors_missed": anchors_missed} + + +def run_content_eval(scenario_path: Path, manifest_root: Path) -> dict[str, Any]: + scenario = _load_json(scenario_path) + + fixtures = scenario["fixtures"] + with_path = (scenario_path.parent / fixtures["with_skill"]).resolve() + without_path = (scenario_path.parent / fixtures["without_skill"]).resolve() + + if not with_path.exists(): + _log(f"[x] missing fixture: {with_path}") + sys.exit(2) + if not without_path.exists(): + _log(f"[x] missing fixture: {without_path}") + sys.exit(2) + + with_text = with_path.read_text(encoding="utf-8") + without_text = without_path.read_text(encoding="utf-8") + + with_score = score_fixture(with_text, scenario["rubric"]) + without_score = score_fixture(without_text, scenario["rubric"]) + + delta_score = with_score["score"] - without_score["score"] + delta_anchors = len( + set(with_score["anchors_hit"]) - set(without_score["anchors_hit"]) + ) + + return { + "id": scenario["id"], + "summary": scenario["summary"], + "with_skill": with_score, + 
"without_skill": without_score, + "delta_score": delta_score, + "delta_anchors": delta_anchors, + } + + +def run_content_evals(manifest: dict[str, Any]) -> dict[str, Any]: + results: list[dict[str, Any]] = [] + delta_min = int(manifest["gates"]["content"]["delta_min_anchors"]) + all_passed = True + + for rel in manifest["content_manifests"]: + scenario_path = EVALS_DIR / rel + result = run_content_eval(scenario_path, EVALS_DIR) + result["passed"] = result["delta_anchors"] >= delta_min + if not result["passed"]: + all_passed = False + results.append(result) + + return { + "delta_min_anchors": delta_min, + "scenarios": results, + "passed": all_passed, + } + + +# --------------------------------------------------------------------------- +# orchestration +# --------------------------------------------------------------------------- + +def build_parser() -> argparse.ArgumentParser: + p = argparse.ArgumentParser( + prog="run_evals.py", + description="Run pr-description-skill evals (triggers + content).", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + p.add_argument( + "--filter", + choices=("all", "triggers", "content"), + default="all", + help="Eval family to run (default: all).", + ) + p.add_argument( + "--split", + choices=("all", "train", "val"), + default="val", + help="Trigger-eval split to score for the gate (default: val, the ship gate).", + ) + p.add_argument( + "--manifest", + default=str(EVALS_DIR / "evals.json"), + help="Path to evals manifest (default: /evals/evals.json).", + ) + p.add_argument( + "--results-dir", + default=str(EVALS_DIR / "results"), + help="Directory to write timestamped result JSON.", + ) + p.add_argument( + "--quiet", + action="store_true", + help="Suppress stderr diagnostics.", + ) + p.add_argument( + "--no-write", + action="store_true", + help="Do not write a result file to --results-dir.", + ) + return p + + +def main(argv: list[str] | None = None) -> int: + args = build_parser().parse_args(argv) + + if args.quiet: + 
global _log + _log = lambda msg: None # noqa: E731 + + manifest_path = Path(args.manifest) + manifest = _load_json(manifest_path) + if manifest.get("schema_version") != SCHEMA_VERSION: + _log( + f"[!] schema_version mismatch: manifest={manifest.get('schema_version')} " + f"runner={SCHEMA_VERSION}" + ) + + summary: dict[str, Any] = { + "schema_version": SCHEMA_VERSION, + "skill": manifest.get("skill"), + "timestamp_utc": _dt.datetime.now(tz=_dt.timezone.utc).isoformat(), + "filter": args.filter, + "split": args.split, + } + + overall_passed = True + + if args.filter in ("all", "triggers"): + _log("[>] running trigger evals") + trig = run_trigger_evals(manifest, args.split) + summary["triggers"] = trig + if not trig["passed"]: + overall_passed = False + + if args.filter in ("all", "content"): + _log("[>] running content evals") + cont = run_content_evals(manifest) + summary["content"] = cont + if not cont["passed"]: + overall_passed = False + + summary["passed"] = overall_passed + + if not args.no_write: + results_dir = Path(args.results_dir) + results_dir.mkdir(parents=True, exist_ok=True) + ts = summary["timestamp_utc"].replace(":", "-").replace("+00-00", "Z") + out_path = results_dir / f"{ts}.json" + out_path.write_text(json.dumps(summary, indent=2) + "\n", encoding="utf-8") + summary["result_file"] = str(out_path.relative_to(SKILL_DIR.parent.parent.parent)) + _log(f"[+] wrote {out_path}") + + print(json.dumps(summary, indent=2)) + return 0 if overall_passed else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/.github/skills/pr-description-skill/evals/.gitignore b/.github/skills/pr-description-skill/evals/.gitignore new file mode 100644 index 000000000..62279c73a --- /dev/null +++ b/.github/skills/pr-description-skill/evals/.gitignore @@ -0,0 +1,2 @@ +# Ignore generated result files; track only the .gitkeep sentinel. 
+results/*.json diff --git a/.github/skills/pr-description-skill/evals/README.md b/.github/skills/pr-description-skill/evals/README.md new file mode 100644 index 000000000..5860c8d45 --- /dev/null +++ b/.github/skills/pr-description-skill/evals/README.md @@ -0,0 +1,132 @@ +# pr-description-skill evals + +This bundle answers two questions deterministically and without +requiring an LLM API key: + +1. **TRIGGER EVALS**: does the SKILL.md `description:` reliably + match real should-fire intents and avoid near-miss queries? +2. **CONTENT EVALS**: does loading the SKILL.md body change the + shape of a PR description an agent produces, vs not loading + it? + +## Layout + +``` +evals/ + evals.json # top-level manifest (gates, keyword lists) + triggers.json # 18 trigger items (9 fire / 9 no-fire), + # ~60/40 train/val split + content/ + auth-refactor.json # cross-cutting refactor scenario + rubric + docs-only.json # docs-only PR scenario + rubric + dep-bump.json # mechanical dep bump scenario + rubric + fixtures/ + <scenario>__with_skill.md # representative output produced + # under the SKILL.md guidance + <scenario>__without_skill.md # representative output produced + # without the skill loaded + results/ + .gitkeep # tracked sentinel + <timestamp>.json # one file per run (gitignored) + README.md # this file +``` + +The runner script lives at +`.apm/skills/pr-description-skill/scripts/run_evals.py`. + +## Run + +From the repo root (or this skill's directory): + +``` +python .apm/skills/pr-description-skill/scripts/run_evals.py +``` + +Common options: + +| Flag | Effect | +|---|---| +| `--filter triggers` | Run only the trigger evals. | +| `--filter content` | Run only the content evals. | +| `--split train` | Score the train split for triggers (default is `val`, the ship gate). | +| `--split all` | Score both splits. | +| `--no-write` | Do not write to `evals/results/`. | +| `--quiet` | Suppress stderr diagnostics. 
| + +Exit codes: + +* `0` = all gates met +* `1` = one or more gates failed +* `2` = runner error (missing manifest, parse error, missing fixture) + +## Trigger eval scoring + +The runner uses a deterministic dispatcher approximation defined in +`evals.json`: + +1. If any phrase from `stop_list` appears verbatim in the query + (lowercased), predict `no_fire`. +2. Otherwise if any phrase from `trigger_keywords_primary` appears + verbatim, predict `fire`. +3. Otherwise count distinct tokens from + `trigger_keywords_secondary`; predict `fire` iff at least 3 + distinct tokens AND one of `{pr, pull}` is present. + +This is NOT a perfect proxy for an actual LLM dispatcher. It is +fast, deterministic, and CI-friendly. When `gh models` becomes +available, an `--llm` flag can be added that calls the real +dispatcher; the manifest schema already accommodates it. + +Ship gate (validation split): + +* should-fire correctness >= 0.5 +* should-not-fire correctness >= 0.5 (i.e. less than 50% of + near-miss queries leak through as `fire`) + +## Content eval scoring + +Each scenario ships two fixtures: one representing the agent's +output WITH the skill loaded, one representing the same task +WITHOUT it. The same regex rubric scores both. The reported +`delta_anchors` is the count of rubric anchors that fire on +`with_skill` but not on `without_skill`. + +Ship gate: every scenario has `delta_anchors >= 1`. A scenario +with zero delta is a signal that the skill is not adding +measurable value on that shape; the genesis discipline says +redesign or delete. (This runner only flags; deletion is a +human decision.) + +### Fixtures and the LLM-in-the-loop question + +The fixtures are pre-recorded representative outputs. They are +NOT regenerated by an LLM at run time. Two reasons: + +1. **Determinism**: re-running the suite must produce identical + results so regressions are visible. +2. **No required keys**: contributors should be able to run the + suite with only Python stdlib. 
+ +When a maintainer materially changes SKILL.md, the fixtures +SHOULD be regenerated by hand (or by running the agent once on +each scenario, with and without the skill loaded, and pasting +the outputs in). Document the regeneration in the PR that +changes SKILL.md. + +## Adding a new eval + +* **New trigger eval**: append an entry to `triggers.json` with + a fresh `id`, the `query`, the `expected` outcome, and the + `split`. Keep the 60/40 ratio roughly intact. +* **New content scenario**: create `content/.json` (mirror + one of the existing files), add `fixtures/__with_skill.md` + and `fixtures/__without_skill.md`, and append the path to + `content_manifests` in `evals.json`. + +## Encoding + +All eval source files (Python, JSON, README) stay within +printable ASCII per the repo-wide encoding rule. Fixtures are +markdown intended to model PR-body output and MAY contain +Unicode, mirroring the SKILL.md "Output charset rule" -- but the +provided fixtures here remain ASCII for portability. diff --git a/.github/skills/pr-description-skill/evals/content/auth-refactor.json b/.github/skills/pr-description-skill/evals/content/auth-refactor.json new file mode 100644 index 000000000..3e4ae270d --- /dev/null +++ b/.github/skills/pr-description-skill/evals/content/auth-refactor.json @@ -0,0 +1,43 @@ +{ + "schema_version": 1, + "id": "auth-refactor", + "summary": "Cross-cutting auth refactor: token resolution moves into a single AuthResolver; multiple call sites updated; one breaking change to a public CLI flag. 
PR is non-trivial and exercises every required section.", + "scenario": { + "branch": "refactor/auth-resolver", + "base": "main", + "files_changed": [ + "M src/apm_cli/auth/__init__.py", + "A src/apm_cli/auth/resolver.py", + "M src/apm_cli/cli.py", + "M src/apm_cli/integration/git.py", + "M tests/unit/auth/test_resolver.py", + "M CHANGELOG.md" + ], + "commits": [ + "feat(auth): introduce AuthResolver as single source of token resolution", + "refactor(cli): replace per-host PAT lookup with AuthResolver.get(host)", + "fix(integration/git): drop legacy GITHUB_APM_PAT fallback path", + "docs(changelog): note breaking removal of --token-source flag" + ], + "linked_issue": "#812 -- token resolution scattered across modules; intermittent EMU lookups failing", + "validation_evidence": "uv run pytest tests/unit/auth -x -> 142 passed in 3.4s\napm audit --ci -> 0 findings" + }, + "rubric": [ + {"id": "tldr-present", "pattern": "(?im)^\\s*##?\\s*TL;DR\\b", "weight": 1, "description": "TL;DR section header present"}, + {"id": "tldr-short", "pattern": "(?is)##?\\s*TL;DR.{1,800}?\\n##?\\s", "weight": 1, "description": "TL;DR is short (under 800 chars before next H2)"}, + {"id": "problem-section", "pattern": "(?im)^\\s*##?\\s*Problem\\b", "weight": 1, "description": "Problem (WHY) section present"}, + {"id": "anchored-quote", "pattern": "\\[\"[^\"]{8,}\"\\]\\(https?://", "weight": 2, "description": "At least one verbatim quote anchored to a URL"}, + {"id": "approach-or-implementation", "pattern": "(?im)^\\s*##?\\s*(Approach|Implementation)\\b", "weight": 1, "description": "Approach or Implementation section present"}, + {"id": "mermaid-block", "pattern": "(?s)```mermaid[\\s\\S]+?```", "weight": 2, "description": "At least one mermaid diagram block"}, + {"id": "tradeoffs-section", "pattern": "(?im)^\\s*##?\\s*Trade-?offs?\\b", "weight": 1, "description": "Trade-offs section present"}, + {"id": "validation-section", "pattern": "(?im)^\\s*##?\\s*Validation\\b", "weight": 1, 
"description": "Validation section present"}, + {"id": "validation-real-output", "pattern": "(?im)pytest|apm audit|uv run", "weight": 1, "description": "Validation shows real command output"}, + {"id": "how-to-test", "pattern": "(?im)^\\s*##?\\s*How to test\\b", "weight": 1, "description": "How to test section present"}, + {"id": "trailer", "pattern": "Co-authored-by: Copilot <223556219", "weight": 1, "description": "Copilot co-author trailer present"}, + {"id": "no-marketing-tone", "pattern": "(?i)significantly enhances|best-in-class|game-changing|revolutionary", "weight": -2, "description": "PENALTY: marketing tone detected"} + ], + "fixtures": { + "with_skill": "../fixtures/auth-refactor__with_skill.md", + "without_skill": "../fixtures/auth-refactor__without_skill.md" + } +} diff --git a/.github/skills/pr-description-skill/evals/content/dep-bump.json b/.github/skills/pr-description-skill/evals/content/dep-bump.json new file mode 100644 index 000000000..1deee05a1 --- /dev/null +++ b/.github/skills/pr-description-skill/evals/content/dep-bump.json @@ -0,0 +1,37 @@ +{ + "schema_version": 1, + "id": "dep-bump", + "summary": "Mechanical dependency upgrade: bump click from 8.1.7 to 8.2.1; lockfile-only changes plus a 1-line type annotation tweak. 
Tests Trade-offs may be 1-2 bullets and mermaid is optional.", + "scenario": { + "branch": "deps/click-8.2.1", + "base": "main", + "files_changed": [ + "M pyproject.toml", + "M uv.lock", + "M src/apm_cli/cli.py" + ], + "commits": [ + "chore(deps): bump click 8.1.7 -> 8.2.1", + "chore(cli): adjust type hint for new click.Context generic" + ], + "linked_issue": "(none)", + "validation_evidence": "uv sync --extra dev -> resolved in 2.1s\nuv run pytest tests/unit -x -> 2418 passed in 38.7s" + }, + "rubric": [ + {"id": "tldr-present", "pattern": "(?im)^\\s*##?\\s*TL;DR\\b", "weight": 1, "description": "TL;DR present"}, + {"id": "tldr-short", "pattern": "(?is)##?\\s*TL;DR.{1,500}?\\n##?\\s", "weight": 1, "description": "TL;DR very short on a mechanical PR"}, + {"id": "problem-section", "pattern": "(?im)^\\s*##?\\s*Problem\\b", "weight": 1, "description": "Problem section present"}, + {"id": "implementation-section", "pattern": "(?im)^\\s*##?\\s*Implementation\\b", "weight": 1, "description": "Implementation section present"}, + {"id": "tradeoffs-section", "pattern": "(?im)^\\s*##?\\s*Trade-?offs?\\b", "weight": 1, "description": "Trade-offs section present (may be 1-2 bullets)"}, + {"id": "validation-section", "pattern": "(?im)^\\s*##?\\s*Validation\\b", "weight": 1, "description": "Validation section present"}, + {"id": "validation-real-output", "pattern": "(?im)pytest|uv sync|2418 passed", "weight": 1, "description": "Real validation output (pytest counts) shown"}, + {"id": "how-to-test", "pattern": "(?im)^\\s*##?\\s*How to test\\b", "weight": 1, "description": "How to test section present"}, + {"id": "trailer", "pattern": "Co-authored-by: Copilot <223556219", "weight": 1, "description": "Copilot co-author trailer present"}, + {"id": "no-marketing-tone", "pattern": "(?i)significantly enhances|best-in-class|game-changing", "weight": -2, "description": "PENALTY: marketing tone detected"}, + {"id": "no-diff-restate", "pattern": "(?im)^[+-]{3} ", "weight": -1, 
"description": "PENALTY: diff lines restated in body"} + ], + "fixtures": { + "with_skill": "../fixtures/dep-bump__with_skill.md", + "without_skill": "../fixtures/dep-bump__without_skill.md" + } +} diff --git a/.github/skills/pr-description-skill/evals/content/docs-only.json b/.github/skills/pr-description-skill/evals/content/docs-only.json new file mode 100644 index 000000000..c3bc93c08 --- /dev/null +++ b/.github/skills/pr-description-skill/evals/content/docs-only.json @@ -0,0 +1,34 @@ +{ + "schema_version": 1, + "id": "docs-only", + "summary": "Markdown-only PR adding a quickstart guide. The skill must keep TL;DR, Problem, Validation, How-to-test sections even though the PR is 'trivial' (per SKILL.md gotcha).", + "scenario": { + "branch": "docs/quickstart", + "base": "main", + "files_changed": [ + "A docs/src/content/docs/guides/quickstart.md", + "M docs/src/content/docs/index.md" + ], + "commits": [ + "docs(quickstart): add 5-minute getting-started guide", + "docs(index): link quickstart from landing page" + ], + "linked_issue": "#640 -- new contributors bounce after install; need a 5-minute happy path", + "validation_evidence": "npm run --prefix docs build -> built in 4.2s, no broken links\napm audit --ci -> 0 findings" + }, + "rubric": [ + {"id": "tldr-present", "pattern": "(?im)^\\s*##?\\s*TL;DR\\b", "weight": 1, "description": "TL;DR present even on docs-only PR"}, + {"id": "problem-section", "pattern": "(?im)^\\s*##?\\s*Problem\\b", "weight": 1, "description": "Problem section present even on docs-only PR (per SKILL.md gotcha)"}, + {"id": "anchored-quote", "pattern": "\\[\"[^\"]{8,}\"\\]\\(https?://", "weight": 2, "description": "At least one verbatim anchored quote"}, + {"id": "validation-section", "pattern": "(?im)^\\s*##?\\s*Validation\\b", "weight": 1, "description": "Validation section present"}, + {"id": "validation-build", "pattern": "(?im)npm run|build|broken link", "weight": 1, "description": "Real validation evidence (build/link check) shown"}, + 
{"id": "how-to-test", "pattern": "(?im)^\\s*##?\\s*How to test\\b", "weight": 1, "description": "How to test present"}, + {"id": "trailer", "pattern": "Co-authored-by: Copilot <223556219", "weight": 1, "description": "Copilot co-author trailer present"}, + {"id": "no-trivial-skip", "pattern": "(?i)the PR is trivial|too small to need", "weight": -2, "description": "PENALTY: skipping sections because PR is small"}, + {"id": "no-marketing-tone", "pattern": "(?i)significantly enhances|best-in-class|game-changing", "weight": -2, "description": "PENALTY: marketing tone detected"} + ], + "fixtures": { + "with_skill": "../fixtures/docs-only__with_skill.md", + "without_skill": "../fixtures/docs-only__without_skill.md" + } +} diff --git a/.github/skills/pr-description-skill/evals/evals.json b/.github/skills/pr-description-skill/evals/evals.json new file mode 100644 index 000000000..f7a8750ea --- /dev/null +++ b/.github/skills/pr-description-skill/evals/evals.json @@ -0,0 +1,62 @@ +{ + "schema_version": 1, + "skill": "pr-description-skill", + "skill_path": "../SKILL.md", + "triggers_manifest": "triggers.json", + "content_manifests": [ + "content/auth-refactor.json", + "content/docs-only.json", + "content/dep-bump.json" + ], + "gates": { + "triggers": { + "split": "val", + "should_fire_rate_min": 0.5, + "should_not_fire_rate_max": 0.5 + }, + "content": { + "delta_min_anchors": 1, + "scope": "every_scenario" + } + }, + "stop_list": [ + "commit message", + "release note", + "release notes", + "changelog", + "open an issue", + "open issue", + "review this pr", + "code review comment", + "design doc", + "summarize the diff" + ], + "trigger_keywords_primary": [ + "pr description", + "pr body", + "pull request description", + "pull request body", + "pr template", + "pr write-up", + "pr writeup", + "draft the pr", + "draft a pr", + "open the pr", + "open a pr", + "summarize this branch as a pr", + "fill in the pr template" + ], + "trigger_keywords_secondary": [ + "pr", + "pull", + 
"request", + "description", + "body", + "draft", + "open", + "write", + "fill", + "summarize", + "branch" + ] +} diff --git a/.github/skills/pr-description-skill/evals/fixtures/auth-refactor__with_skill.md b/.github/skills/pr-description-skill/evals/fixtures/auth-refactor__with_skill.md new file mode 100644 index 000000000..0078a04a9 --- /dev/null +++ b/.github/skills/pr-description-skill/evals/fixtures/auth-refactor__with_skill.md @@ -0,0 +1,105 @@ +refactor(auth): centralize token resolution behind AuthResolver + +## TL;DR + +Token resolution was scattered across `cli.py`, `integration/git.py`, +and ad-hoc `os.environ` lookups, producing intermittent EMU failures +and a `--token-source` flag whose semantics no two callers agreed +on. This PR introduces a single `AuthResolver` and removes the +flag. One breaking change (flag removal) is documented in the +CHANGELOG. + +## Problem (WHY) + +- Three modules each implemented their own host-to-token mapping, + diverging on EMU host suffixes (`#812`). +- The `--token-source` CLI flag let callers override resolution + per-invocation but was respected only by `cli.py`, silently + ignored by the git integration layer. +- This produced support reports of "auth works in `apm install` + but fails in `apm compile`" for the same machine. + +The skill's PROSE foundation calls this out directly: +["Grounding outputs in deterministic tool execution transforms probabilistic generation into verifiable action."](https://danielmeppiel.github.io/awesome-ai-native/docs/prose/) +-- token resolution that branches by call site is the opposite of +deterministic. + +## Approach (WHAT) + +A single `AuthResolver` owns the host-to-token mapping. Callers +ask `AuthResolver.get(host)` and get a token or a typed error. +The `--token-source` flag is removed; the resolver consults a +fixed precedence (env -> keyring -> gh auth) that is identical +for every caller. 
+ +## Implementation (HOW) + +| File | Change | +|---|---| +| `src/apm_cli/auth/resolver.py` | NEW. `AuthResolver` class with `get(host)`, fixed precedence chain. | +| `src/apm_cli/auth/__init__.py` | Re-export `AuthResolver`. | +| `src/apm_cli/cli.py` | Drop `--token-source`; route all auth lookups through `AuthResolver`. | +| `src/apm_cli/integration/git.py` | Drop legacy `GITHUB_APM_PAT` fallback; delegate to resolver. | +| `tests/unit/auth/test_resolver.py` | New unit tests for resolver precedence and EMU host parsing. | +| `CHANGELOG.md` | Note `--token-source` removal under Breaking Changes. | + +## Diagram + +The resolver becomes the only edge from callers to the token +sources; the legacy fan-out is collapsed. + +```mermaid +flowchart LR + CLI["apm cli command"] --> R["AuthResolver"] + GIT["git integration"] --> R + R --> ENV["environment variables"] + R --> KR["OS keyring"] + R --> GH["gh auth status"] +``` + +## Trade-offs + +- **Removed `--token-source`**: minor break for power users who + scripted around the flag; mitigated by the env-var path which + covers every documented use case. +- **Single precedence chain**: any future per-host policy (e.g. + ADO_APM_PAT for Azure DevOps) plugs in as a new resolver + source, not a new caller-side branch. +- **Rejected**: keeping the flag and routing it through the + resolver. That preserves the bug surface (callers still + passing different flag values) without adding value. + +## Benefits + +1. One source of truth for auth (closes #812). +2. EMU host suffixes parsed in one place. +3. Test surface shrinks from 3 modules to 1. +4. Removes 84 lines of duplicated env-var lookup code. + +## Validation + +
Full test output + +``` +$ uv run pytest tests/unit/auth -x +============================= test session starts ============================= +collected 142 items +tests/unit/auth/test_resolver.py ............................................ +142 passed in 3.4s + +$ apm audit --ci +0 findings +``` + +
+ +## How to test + +- [ ] `uv sync --extra dev` +- [ ] `uv run pytest tests/unit/auth -x` (expect 142 passing) +- [ ] `apm install microsoft/apm/packages/apm-guide` against an + EMU host; confirm the install completes without prompting. +- [ ] Try `apm install --token-source=env`; expect a clear + "unknown option" error (flag is gone). + +Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> diff --git a/.github/skills/pr-description-skill/evals/fixtures/auth-refactor__without_skill.md b/.github/skills/pr-description-skill/evals/fixtures/auth-refactor__without_skill.md new file mode 100644 index 000000000..10b6d2119 --- /dev/null +++ b/.github/skills/pr-description-skill/evals/fixtures/auth-refactor__without_skill.md @@ -0,0 +1,32 @@ +# Auth refactor + +This PR refactors authentication. It centralizes token resolution +into a new AuthResolver class and removes the --token-source flag. + +## What changed + +- Added a new file src/apm_cli/auth/resolver.py with the + AuthResolver class. +- Updated src/apm_cli/auth/__init__.py to export AuthResolver. +- Updated src/apm_cli/cli.py to use AuthResolver for all token + lookups and removed the --token-source flag. +- Updated src/apm_cli/integration/git.py to delegate to + AuthResolver and dropped the GITHUB_APM_PAT fallback. +- Added unit tests in tests/unit/auth/test_resolver.py. +- Updated CHANGELOG.md. + +## Why + +The previous code had token resolution logic scattered across +multiple modules. This was fragile and led to bugs. Centralizing +it makes the code cleaner and easier to maintain. This is a +significantly enhanced approach to authentication. + +## Testing + +I ran the tests and they passed. The audit also passed. + +## Notes + +This is a breaking change because the --token-source flag is +removed. 
diff --git a/.github/skills/pr-description-skill/evals/fixtures/dep-bump__with_skill.md b/.github/skills/pr-description-skill/evals/fixtures/dep-bump__with_skill.md new file mode 100644 index 000000000..06d48f629 --- /dev/null +++ b/.github/skills/pr-description-skill/evals/fixtures/dep-bump__with_skill.md @@ -0,0 +1,52 @@ +chore(deps): bump click 8.1.7 -> 8.2.1 + +## TL;DR + +Routine click upgrade. Picks up the new `Context` generic typing +and a fix for boolean-flag short forms. One type hint adjusted in +`cli.py`. Lockfile regenerated; full unit suite green. + +## Problem (WHY) + +- click 8.2.x ships a typing fix flagged by mypy on Python 3.12. +- Staying on 8.1.7 means our type checks fail under the supported + Python matrix. + +The skill's anchor for cite-or-omit applies here too: +["Add what the agent lacks, omit what it knows"](https://agentskills.io/skill-creation/best-practices) +-- this PR body says only what the diff alone cannot. + +## Approach (WHAT) + +Mechanical. Bump version pin, regenerate lockfile, adjust one +type hint. + +## Implementation (HOW) + +| File | Change | +|---|---| +| `pyproject.toml` | `click = "^8.2.1"` (was `^8.1.7`). | +| `uv.lock` | Regenerated by `uv sync`. | +| `src/apm_cli/cli.py` | One type hint updated for new `click.Context` generic. | + +## Trade-offs + +- **Direct bump vs caret-locking to 8.1**: chose direct bump + because the typing fix is the reason for the upgrade. 
+ +## Validation + +``` +$ uv sync --extra dev +resolved in 2.1s +$ uv run pytest tests/unit -x +2418 passed in 38.7s +``` + +## How to test + +- [ ] `uv sync --extra dev` +- [ ] `uv run pytest tests/unit -x` (expect 2418 passing) +- [ ] `uv run mypy src/apm_cli` (expect 0 errors) + +Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> diff --git a/.github/skills/pr-description-skill/evals/fixtures/dep-bump__without_skill.md b/.github/skills/pr-description-skill/evals/fixtures/dep-bump__without_skill.md new file mode 100644 index 000000000..21ccd2594 --- /dev/null +++ b/.github/skills/pr-description-skill/evals/fixtures/dep-bump__without_skill.md @@ -0,0 +1,13 @@ +# Bump click + +This bumps click from 8.1.7 to 8.2.1. + +I also adjusted a type hint in cli.py because click.Context is +now generic. + +The lockfile was regenerated. + +This is a game-changing dependency upgrade for our type-checking +story. + +Tests pass. diff --git a/.github/skills/pr-description-skill/evals/fixtures/docs-only__with_skill.md b/.github/skills/pr-description-skill/evals/fixtures/docs-only__with_skill.md new file mode 100644 index 000000000..382f5470b --- /dev/null +++ b/.github/skills/pr-description-skill/evals/fixtures/docs-only__with_skill.md @@ -0,0 +1,63 @@ +docs(quickstart): add 5-minute getting-started guide + +## TL;DR + +New contributors land on the docs index, scroll, and bounce. +This PR adds a 5-minute quickstart linked from the landing page +so first-touch users have a happy path before reading reference +material. + +## Problem (WHY) + +- Issue #640 reports a 60% bounce rate on `/docs/` for new + visitors. +- The current landing page lists every guide in alphabetical + order; "Quickstart" did not exist. +- Agent Skills makes the diagnosis explicit: + ["agents pattern-match well against concrete structures"](https://agentskills.io/skill-creation/best-practices) + -- and the same is true for human readers landing cold. 
+ +## Approach (WHAT) + +Additive: see Implementation. One new page, one link added on +the index. + +## Implementation (HOW) + +| File | Change | +|---|---| +| `docs/src/content/docs/guides/quickstart.md` | NEW. 5-minute happy path: install, init, run. | +| `docs/src/content/docs/index.md` | Promote quickstart link above the alphabetical guide list. | + +## Trade-offs + +- **Promoting one guide above the list**: minor visual + asymmetry; pays for itself if bounce rate drops. +- **Rejected**: rewriting the landing page. Higher risk, larger + diff, harder to A/B. + +## Benefits + +1. Closes #640 if bounce rate drops measurably. +2. New contributor first-touch falls under 5 minutes. +3. Zero code change; rollback is `git revert`. + +## Validation + +``` +$ npm run --prefix docs build +built in 4.2s +no broken links +$ apm audit --ci +0 findings +``` + +## How to test + +- [ ] `npm run --prefix docs dev` +- [ ] Open `http://localhost:4321/docs/`; confirm Quickstart + link is above the fold. +- [ ] Click through; confirm the 5 steps complete on a fresh + clone in under 5 minutes. + +Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> diff --git a/.github/skills/pr-description-skill/evals/fixtures/docs-only__without_skill.md b/.github/skills/pr-description-skill/evals/fixtures/docs-only__without_skill.md new file mode 100644 index 000000000..5dddbc2fc --- /dev/null +++ b/.github/skills/pr-description-skill/evals/fixtures/docs-only__without_skill.md @@ -0,0 +1,14 @@ +# Add quickstart guide + +Adds a quickstart guide to the docs. + +## Changes + +- Added docs/src/content/docs/guides/quickstart.md +- Updated docs/src/content/docs/index.md to link to it + +The PR is trivial -- just docs. No code changes. + +## Build + +The docs build passes. 
diff --git a/.github/skills/pr-description-skill/evals/results/.gitkeep b/.github/skills/pr-description-skill/evals/results/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.github/skills/pr-description-skill/evals/triggers.json b/.github/skills/pr-description-skill/evals/triggers.json new file mode 100644 index 000000000..c5baa891c --- /dev/null +++ b/.github/skills/pr-description-skill/evals/triggers.json @@ -0,0 +1,131 @@ +{ + "schema_version": 1, + "items": [ + { + "id": "fire-01", + "query": "write a PR description for me", + "expected": "fire", + "split": "train", + "notes": "Verbatim phrase from SKILL.md description." + }, + { + "id": "fire-02", + "query": "draft the PR body", + "expected": "fire", + "split": "train", + "notes": "Verbatim phrase from SKILL.md description." + }, + { + "id": "fire-03", + "query": "open this PR -- can you fill in the template?", + "expected": "fire", + "split": "train", + "notes": "Combined open-PR + fill-template trigger." + }, + { + "id": "fire-04", + "query": "create the PR write-up please", + "expected": "fire", + "split": "train", + "notes": "Verbatim phrase from SKILL.md trigger list." + }, + { + "id": "fire-05", + "query": "summarize this branch as a PR", + "expected": "fire", + "split": "train", + "notes": "Verbatim phrase from SKILL.md trigger list." + }, + { + "id": "fire-06", + "query": "let's open the PR -- write the body", + "expected": "fire", + "split": "val", + "notes": "Open-PR + body, near-domain paraphrase." + }, + { + "id": "fire-07", + "query": "draft a pull request description for this branch", + "expected": "fire", + "split": "val", + "notes": "Long form of 'PR description'." + }, + { + "id": "fire-08", + "query": "pls write the pull request body", + "expected": "fire", + "split": "val", + "notes": "Casual tone, long form." 
+ }, + { + "id": "fire-09", + "query": "fill in the PR template before we ship", + "expected": "fire", + "split": "val", + "notes": "Verbatim phrase wrapped in a longer sentence." + }, + { + "id": "no-fire-01", + "query": "write a commit message for these changes", + "expected": "no_fire", + "split": "train", + "notes": "Adjacent intent (commit, not PR) -- common confusion." + }, + { + "id": "no-fire-02", + "query": "summarize this branch as a release note", + "expected": "no_fire", + "split": "train", + "notes": "Adjacent intent (release notes, not PR body)." + }, + { + "id": "no-fire-03", + "query": "draft a CHANGELOG entry for this work", + "expected": "no_fire", + "split": "train", + "notes": "Changelog work, not PR description." + }, + { + "id": "no-fire-04", + "query": "open an issue describing this bug", + "expected": "no_fire", + "split": "train", + "notes": "Issue, not PR." + }, + { + "id": "no-fire-05", + "query": "review this PR for me", + "expected": "no_fire", + "split": "train", + "notes": "Reviewing != describing." + }, + { + "id": "no-fire-06", + "query": "summarize the diff for me", + "expected": "no_fire", + "split": "val", + "notes": "Diff summary is an INPUT to PR description skill, not the same task." + }, + { + "id": "no-fire-07", + "query": "draft a code review comment", + "expected": "no_fire", + "split": "val", + "notes": "Review comment, not PR body." + }, + { + "id": "no-fire-08", + "query": "write the design doc for this branch", + "expected": "no_fire", + "split": "val", + "notes": "Design doc lives upstream of PR; different artifact." + }, + { + "id": "no-fire-09", + "query": "give me release notes for this milestone", + "expected": "no_fire", + "split": "val", + "notes": "Release notes, not PR body." 
+ } + ] +} diff --git a/.github/skills/pr-description-skill/scripts/run_evals.py b/.github/skills/pr-description-skill/scripts/run_evals.py new file mode 100644 index 000000000..c5056aaf1 --- /dev/null +++ b/.github/skills/pr-description-skill/scripts/run_evals.py @@ -0,0 +1,343 @@ +#!/usr/bin/env python3 +# ASCII-only. Runs the pr-description-skill evals suite. +"""Run pr-description-skill evals. + +Two eval families are exercised: + + * TRIGGER EVALS scoring the SKILL.md `description:` against + should-fire and should-not-fire queries via a deterministic + keyword/bigram matcher. The matcher is documented in the + README; it approximates dispatcher behavior without requiring + a live LLM. + + * CONTENT EVALS scoring pre-recorded `with_skill` and + `without_skill` fixtures against per-scenario regex rubrics. + The delta between the two scores is reported per scenario. + +The runner is non-interactive, stdlib-only, and emits structured +JSON on stdout (machine-readable summary) plus diagnostics on +stderr. Exit codes: + + 0 = all gates met + 1 = one or more gates failed + 2 = runner error (manifest or fixture missing, parse error) + +Run from the worktree root: + + python .apm/skills/pr-description-skill/scripts/run_evals.py + +Use --help for full options. 
+""" +from __future__ import annotations + +import argparse +import datetime as _dt +import json +import re +import sys +from pathlib import Path +from typing import Any + +SCHEMA_VERSION = 1 +SKILL_DIR = Path(__file__).resolve().parent.parent +EVALS_DIR = SKILL_DIR / "evals" + + +def _log(msg: str) -> None: + print(msg, file=sys.stderr) + + +def _load_json(path: Path) -> dict[str, Any]: + try: + return json.loads(path.read_text(encoding="utf-8")) + except FileNotFoundError: + _log(f"[x] missing file: {path}") + sys.exit(2) + except json.JSONDecodeError as exc: + _log(f"[x] invalid JSON in {path}: {exc}") + sys.exit(2) + + +# --------------------------------------------------------------------------- +# trigger evals +# --------------------------------------------------------------------------- + +def _normalize(text: str) -> str: + return re.sub(r"\s+", " ", text.lower()).strip() + + +def score_trigger(query: str, manifest: dict[str, Any]) -> tuple[bool, dict[str, Any]]: + """Return (predicted_fire, diagnostic) for one query. + + Rule (deterministic dispatcher approximation): + 1. Lowercase + collapse whitespace. + 2. If any phrase from `stop_list` appears verbatim, predict + no_fire (negative override beats everything else). + 3. If any phrase from `trigger_keywords_primary` appears + verbatim, predict fire. + 4. Otherwise count distinct `trigger_keywords_secondary` + tokens present; predict fire iff the count is >= 3 AND + at least one of {pr, pull} is present. 
+ """ + q = _normalize(query) + + for stop in manifest["stop_list"]: + if stop in q: + return False, {"reason": "stop_list_hit", "match": stop} + + for kw in manifest["trigger_keywords_primary"]: + if kw in q: + return True, {"reason": "primary_match", "match": kw} + + sec = manifest["trigger_keywords_secondary"] + hits = sorted({k for k in sec if re.search(rf"\b{re.escape(k)}\b", q)}) + has_pr = any(t in hits for t in ("pr", "pull")) + if has_pr and len(hits) >= 3: + return True, {"reason": "secondary_threshold", "hits": hits} + + return False, {"reason": "no_match", "hits": hits} + + +def run_trigger_evals( + manifest: dict[str, Any], split: str +) -> dict[str, Any]: + triggers_path = EVALS_DIR / manifest["triggers_manifest"] + triggers = _load_json(triggers_path)["items"] + + if split != "all": + triggers = [t for t in triggers if t["split"] == split] + + rows: list[dict[str, Any]] = [] + fire_total = fire_correct = 0 + no_fire_total = no_fire_correct = 0 + + for item in triggers: + predicted_fire, diag = score_trigger(item["query"], manifest) + expected_fire = item["expected"] == "fire" + passed = predicted_fire == expected_fire + + rows.append({ + "id": item["id"], + "split": item["split"], + "query": item["query"], + "expected": item["expected"], + "predicted": "fire" if predicted_fire else "no_fire", + "passed": passed, + "diagnostic": diag, + }) + + if expected_fire: + fire_total += 1 + if passed: + fire_correct += 1 + else: + no_fire_total += 1 + if passed: + no_fire_correct += 1 + + fire_rate = (fire_correct / fire_total) if fire_total else 0.0 + no_fire_rate = (no_fire_correct / no_fire_total) if no_fire_total else 0.0 + + gates = manifest["gates"]["triggers"] + fire_gate_met = fire_rate >= gates["should_fire_rate_min"] + no_fire_gate_met = ( + (no_fire_rate) >= (1.0 - gates["should_not_fire_rate_max"]) + ) + + return { + "split": split, + "should_fire_correct_rate": fire_rate, + "should_not_fire_correct_rate": no_fire_rate, + "should_fire_gate": { + 
"min": gates["should_fire_rate_min"], + "met": fire_gate_met, + }, + "should_not_fire_gate": { + "max_miss_rate": gates["should_not_fire_rate_max"], + "met": no_fire_gate_met, + }, + "rows": rows, + "passed": fire_gate_met and no_fire_gate_met, + } + + +# --------------------------------------------------------------------------- +# content evals +# --------------------------------------------------------------------------- + +def score_fixture(text: str, rubric: list[dict[str, Any]]) -> dict[str, Any]: + score = 0 + anchors_hit: list[str] = [] + anchors_missed: list[str] = [] + for check in rubric: + pattern = re.compile(check["pattern"]) + match = bool(pattern.search(text)) + weight = int(check["weight"]) + if match: + score += weight + anchors_hit.append(check["id"]) + else: + if weight < 0: + # penalty pattern: not matching is GOOD, no score change + pass + else: + anchors_missed.append(check["id"]) + return {"score": score, "anchors_hit": anchors_hit, "anchors_missed": anchors_missed} + + +def run_content_eval(scenario_path: Path, manifest_root: Path) -> dict[str, Any]: + scenario = _load_json(scenario_path) + + fixtures = scenario["fixtures"] + with_path = (scenario_path.parent / fixtures["with_skill"]).resolve() + without_path = (scenario_path.parent / fixtures["without_skill"]).resolve() + + if not with_path.exists(): + _log(f"[x] missing fixture: {with_path}") + sys.exit(2) + if not without_path.exists(): + _log(f"[x] missing fixture: {without_path}") + sys.exit(2) + + with_text = with_path.read_text(encoding="utf-8") + without_text = without_path.read_text(encoding="utf-8") + + with_score = score_fixture(with_text, scenario["rubric"]) + without_score = score_fixture(without_text, scenario["rubric"]) + + delta_score = with_score["score"] - without_score["score"] + delta_anchors = len( + set(with_score["anchors_hit"]) - set(without_score["anchors_hit"]) + ) + + return { + "id": scenario["id"], + "summary": scenario["summary"], + "with_skill": with_score, + 
"without_skill": without_score, + "delta_score": delta_score, + "delta_anchors": delta_anchors, + } + + +def run_content_evals(manifest: dict[str, Any]) -> dict[str, Any]: + results: list[dict[str, Any]] = [] + delta_min = int(manifest["gates"]["content"]["delta_min_anchors"]) + all_passed = True + + for rel in manifest["content_manifests"]: + scenario_path = EVALS_DIR / rel + result = run_content_eval(scenario_path, EVALS_DIR) + result["passed"] = result["delta_anchors"] >= delta_min + if not result["passed"]: + all_passed = False + results.append(result) + + return { + "delta_min_anchors": delta_min, + "scenarios": results, + "passed": all_passed, + } + + +# --------------------------------------------------------------------------- +# orchestration +# --------------------------------------------------------------------------- + +def build_parser() -> argparse.ArgumentParser: + p = argparse.ArgumentParser( + prog="run_evals.py", + description="Run pr-description-skill evals (triggers + content).", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + p.add_argument( + "--filter", + choices=("all", "triggers", "content"), + default="all", + help="Eval family to run (default: all).", + ) + p.add_argument( + "--split", + choices=("all", "train", "val"), + default="val", + help="Trigger-eval split to score for the gate (default: val, the ship gate).", + ) + p.add_argument( + "--manifest", + default=str(EVALS_DIR / "evals.json"), + help="Path to evals manifest (default: /evals/evals.json).", + ) + p.add_argument( + "--results-dir", + default=str(EVALS_DIR / "results"), + help="Directory to write timestamped result JSON.", + ) + p.add_argument( + "--quiet", + action="store_true", + help="Suppress stderr diagnostics.", + ) + p.add_argument( + "--no-write", + action="store_true", + help="Do not write a result file to --results-dir.", + ) + return p + + +def main(argv: list[str] | None = None) -> int: + args = build_parser().parse_args(argv) + + if args.quiet: + 
global _log + _log = lambda msg: None # noqa: E731 + + manifest_path = Path(args.manifest) + manifest = _load_json(manifest_path) + if manifest.get("schema_version") != SCHEMA_VERSION: + _log( + f"[!] schema_version mismatch: manifest={manifest.get('schema_version')} " + f"runner={SCHEMA_VERSION}" + ) + + summary: dict[str, Any] = { + "schema_version": SCHEMA_VERSION, + "skill": manifest.get("skill"), + "timestamp_utc": _dt.datetime.now(tz=_dt.timezone.utc).isoformat(), + "filter": args.filter, + "split": args.split, + } + + overall_passed = True + + if args.filter in ("all", "triggers"): + _log("[>] running trigger evals") + trig = run_trigger_evals(manifest, args.split) + summary["triggers"] = trig + if not trig["passed"]: + overall_passed = False + + if args.filter in ("all", "content"): + _log("[>] running content evals") + cont = run_content_evals(manifest) + summary["content"] = cont + if not cont["passed"]: + overall_passed = False + + summary["passed"] = overall_passed + + if not args.no_write: + results_dir = Path(args.results_dir) + results_dir.mkdir(parents=True, exist_ok=True) + ts = summary["timestamp_utc"].replace(":", "-").replace("+00-00", "Z") + out_path = results_dir / f"{ts}.json" + out_path.write_text(json.dumps(summary, indent=2) + "\n", encoding="utf-8") + summary["result_file"] = str(out_path.relative_to(SKILL_DIR.parent.parent.parent)) + _log(f"[+] wrote {out_path}") + + print(json.dumps(summary, indent=2)) + return 0 if overall_passed else 1 + + +if __name__ == "__main__": + sys.exit(main())