From 06bbd9bf7eac93f52fe83d15ceb37c022deb8dd8 Mon Sep 17 00:00:00 2001 From: Daniel Meppiel Date: Mon, 27 Apr 2026 11:02:33 +0200 Subject: [PATCH] Add evals suite to pr-description-skill (genesis-driven) Adds a self-contained evals/ bundle and a non-interactive runner under .apm/skills/pr-description-skill/. Mirror regenerated via apm install --target copilot. - evals/evals.json: top-level manifest + ship gates - evals/triggers.json: 18 trigger items (9 fire / 9 no-fire), 60/40 train/val split - evals/content/*.json: 3 PR-shape scenarios (cross-cutting refactor, docs-only, mechanical dep bump) each with regex rubric and with_skill/without_skill fixtures - evals/fixtures/*.md: pre-recorded sample outputs for the two conditions; deterministic, no LLM API required at runtime - scripts/run_evals.py: stdlib-only runner; --filter, --split, --quiet, --no-write, --help; JSON on stdout, diagnostics on stderr, exit codes 0/1/2 - evals/results/.gitkeep + evals/.gitignore: track the dir, ignore generated result JSON - evals/README.md: how to run, scoring rules, gates, how to add evals, regeneration policy Genesis handoff packet persisted out-of-tree at the session state dir; design ends at step 6, this commit is step 7b output. 
Validation (val split): - triggers should-fire correctness = 1.0 (gate >= 0.5) - triggers should-not-fire correctness = 1.0 (gate >= 0.5) - content delta_anchors per scenario = 11, 6, 9 (gate >= 1) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../pr-description-skill/evals/.gitignore | 2 + .../pr-description-skill/evals/README.md | 132 +++++++ .../evals/content/auth-refactor.json | 43 +++ .../evals/content/dep-bump.json | 37 ++ .../evals/content/docs-only.json | 34 ++ .../pr-description-skill/evals/evals.json | 62 ++++ .../fixtures/auth-refactor__with_skill.md | 105 ++++++ .../fixtures/auth-refactor__without_skill.md | 32 ++ .../evals/fixtures/dep-bump__with_skill.md | 52 +++ .../evals/fixtures/dep-bump__without_skill.md | 13 + .../evals/fixtures/docs-only__with_skill.md | 63 ++++ .../fixtures/docs-only__without_skill.md | 14 + .../evals/results/.gitkeep | 0 .../pr-description-skill/evals/triggers.json | 131 +++++++ .../pr-description-skill/scripts/run_evals.py | 343 ++++++++++++++++++ .../pr-description-skill/evals/.gitignore | 2 + .../pr-description-skill/evals/README.md | 132 +++++++ .../evals/content/auth-refactor.json | 43 +++ .../evals/content/dep-bump.json | 37 ++ .../evals/content/docs-only.json | 34 ++ .../pr-description-skill/evals/evals.json | 62 ++++ .../fixtures/auth-refactor__with_skill.md | 105 ++++++ .../fixtures/auth-refactor__without_skill.md | 32 ++ .../evals/fixtures/dep-bump__with_skill.md | 52 +++ .../evals/fixtures/dep-bump__without_skill.md | 13 + .../evals/fixtures/docs-only__with_skill.md | 63 ++++ .../fixtures/docs-only__without_skill.md | 14 + .../evals/results/.gitkeep | 0 .../pr-description-skill/evals/triggers.json | 131 +++++++ .../pr-description-skill/scripts/run_evals.py | 343 ++++++++++++++++++ 30 files changed, 2126 insertions(+) create mode 100644 .apm/skills/pr-description-skill/evals/.gitignore create mode 100644 .apm/skills/pr-description-skill/evals/README.md create mode 100644 
.apm/skills/pr-description-skill/evals/content/auth-refactor.json create mode 100644 .apm/skills/pr-description-skill/evals/content/dep-bump.json create mode 100644 .apm/skills/pr-description-skill/evals/content/docs-only.json create mode 100644 .apm/skills/pr-description-skill/evals/evals.json create mode 100644 .apm/skills/pr-description-skill/evals/fixtures/auth-refactor__with_skill.md create mode 100644 .apm/skills/pr-description-skill/evals/fixtures/auth-refactor__without_skill.md create mode 100644 .apm/skills/pr-description-skill/evals/fixtures/dep-bump__with_skill.md create mode 100644 .apm/skills/pr-description-skill/evals/fixtures/dep-bump__without_skill.md create mode 100644 .apm/skills/pr-description-skill/evals/fixtures/docs-only__with_skill.md create mode 100644 .apm/skills/pr-description-skill/evals/fixtures/docs-only__without_skill.md create mode 100644 .apm/skills/pr-description-skill/evals/results/.gitkeep create mode 100644 .apm/skills/pr-description-skill/evals/triggers.json create mode 100644 .apm/skills/pr-description-skill/scripts/run_evals.py create mode 100644 .github/skills/pr-description-skill/evals/.gitignore create mode 100644 .github/skills/pr-description-skill/evals/README.md create mode 100644 .github/skills/pr-description-skill/evals/content/auth-refactor.json create mode 100644 .github/skills/pr-description-skill/evals/content/dep-bump.json create mode 100644 .github/skills/pr-description-skill/evals/content/docs-only.json create mode 100644 .github/skills/pr-description-skill/evals/evals.json create mode 100644 .github/skills/pr-description-skill/evals/fixtures/auth-refactor__with_skill.md create mode 100644 .github/skills/pr-description-skill/evals/fixtures/auth-refactor__without_skill.md create mode 100644 .github/skills/pr-description-skill/evals/fixtures/dep-bump__with_skill.md create mode 100644 .github/skills/pr-description-skill/evals/fixtures/dep-bump__without_skill.md create mode 100644 
.github/skills/pr-description-skill/evals/fixtures/docs-only__with_skill.md create mode 100644 .github/skills/pr-description-skill/evals/fixtures/docs-only__without_skill.md create mode 100644 .github/skills/pr-description-skill/evals/results/.gitkeep create mode 100644 .github/skills/pr-description-skill/evals/triggers.json create mode 100644 .github/skills/pr-description-skill/scripts/run_evals.py diff --git a/.apm/skills/pr-description-skill/evals/.gitignore b/.apm/skills/pr-description-skill/evals/.gitignore new file mode 100644 index 000000000..62279c73a --- /dev/null +++ b/.apm/skills/pr-description-skill/evals/.gitignore @@ -0,0 +1,2 @@ +# Ignore generated result files; track only the .gitkeep sentinel. +results/*.json diff --git a/.apm/skills/pr-description-skill/evals/README.md b/.apm/skills/pr-description-skill/evals/README.md new file mode 100644 index 000000000..5860c8d45 --- /dev/null +++ b/.apm/skills/pr-description-skill/evals/README.md @@ -0,0 +1,132 @@ +# pr-description-skill evals + +This bundle answers two questions deterministically and without +requiring an LLM API key: + +1. **TRIGGER EVALS**: does the SKILL.md `description:` reliably + match real should-fire intents and avoid near-miss queries? +2. **CONTENT EVALS**: does loading the SKILL.md body change the + shape of a PR description an agent produces, vs not loading + it? 
+ +## Layout + +``` +evals/ + evals.json # top-level manifest (gates, keyword lists) + triggers.json # 18 trigger items (9 fire / 9 no-fire), + # ~60/40 train/val split + content/ + auth-refactor.json # cross-cutting refactor scenario + rubric + docs-only.json # docs-only PR scenario + rubric + dep-bump.json # mechanical dep bump scenario + rubric + fixtures/ + <id>__with_skill.md # representative output produced + # under the SKILL.md guidance + <id>__without_skill.md # representative output produced + # without the skill loaded + results/ + .gitkeep # tracked sentinel + *.json # one file per run (gitignored) + README.md # this file +``` + +The runner script lives at +`.apm/skills/pr-description-skill/scripts/run_evals.py`. + +## Run + +From the repo root (or this skill's directory): + +``` +python .apm/skills/pr-description-skill/scripts/run_evals.py +``` + +Common options: + +| Flag | Effect | +|---|---| +| `--filter triggers` | Run only the trigger evals. | +| `--filter content` | Run only the content evals. | +| `--split train` | Score the train split for triggers (default is `val`, the ship gate). | +| `--split all` | Score both splits. | +| `--no-write` | Do not write to `evals/results/`. | +| `--quiet` | Suppress stderr diagnostics. | + +Exit codes: + +* `0` = all gates met +* `1` = one or more gates failed +* `2` = runner error (missing manifest, parse error, missing fixture) + +## Trigger eval scoring + +The runner uses a deterministic dispatcher approximation defined in +`evals.json`: + +1. If any phrase from `stop_list` appears verbatim in the query + (lowercased), predict `no_fire`. +2. Otherwise if any phrase from `trigger_keywords_primary` appears + verbatim, predict `fire`. +3. Otherwise count distinct tokens from + `trigger_keywords_secondary`; predict `fire` iff at least 3 + distinct tokens AND one of `{pr, pull}` is present. + +This is NOT a perfect proxy for an actual LLM dispatcher. It is +fast, deterministic, and CI-friendly. 
When `gh models` becomes +available, an `--llm` flag can be added that calls the real +dispatcher; the manifest schema already accommodates it. + +Ship gate (validation split): + +* should-fire correctness >= 0.5 +* should-not-fire correctness >= 0.5 (i.e. less than 50% of + near-miss queries leak through as `fire`) + +## Content eval scoring + +Each scenario ships two fixtures: one representing the agent's +output WITH the skill loaded, one representing the same task +WITHOUT it. The same regex rubric scores both. The reported +`delta_anchors` is the count of rubric anchors that fire on +`with_skill` but not on `without_skill`. + +Ship gate: every scenario has `delta_anchors >= 1`. A scenario +with zero delta is a signal that the skill is not adding +measurable value on that shape; the genesis discipline says +redesign or delete. (This runner only flags; deletion is a +human decision.) + +### Fixtures and the LLM-in-the-loop question + +The fixtures are pre-recorded representative outputs. They are +NOT regenerated by an LLM at run time. Two reasons: + +1. **Determinism**: re-running the suite must produce identical + results so regressions are visible. +2. **No required keys**: contributors should be able to run the + suite with only Python stdlib. + +When a maintainer materially changes SKILL.md, the fixtures +SHOULD be regenerated by hand (or by running the agent once on +each scenario, with and without the skill loaded, and pasting +the outputs in). Document the regeneration in the PR that +changes SKILL.md. + +## Adding a new eval + +* **New trigger eval**: append an entry to `triggers.json` with + a fresh `id`, the `query`, the `expected` outcome, and the + `split`. Keep the 60/40 ratio roughly intact. +* **New content scenario**: create `content/<id>.json` (mirror + one of the existing files), add `fixtures/<id>__with_skill.md` + and `fixtures/<id>__without_skill.md`, and append the path to + `content_manifests` in `evals.json`. 
+ +## Encoding + +All eval source files (Python, JSON, README) stay within +printable ASCII per the repo-wide encoding rule. Fixtures are +markdown intended to model PR-body output and MAY contain +Unicode, mirroring the SKILL.md "Output charset rule" -- but the +provided fixtures here remain ASCII for portability. diff --git a/.apm/skills/pr-description-skill/evals/content/auth-refactor.json b/.apm/skills/pr-description-skill/evals/content/auth-refactor.json new file mode 100644 index 000000000..3e4ae270d --- /dev/null +++ b/.apm/skills/pr-description-skill/evals/content/auth-refactor.json @@ -0,0 +1,43 @@ +{ + "schema_version": 1, + "id": "auth-refactor", + "summary": "Cross-cutting auth refactor: token resolution moves into a single AuthResolver; multiple call sites updated; one breaking change to a public CLI flag. PR is non-trivial and exercises every required section.", + "scenario": { + "branch": "refactor/auth-resolver", + "base": "main", + "files_changed": [ + "M src/apm_cli/auth/__init__.py", + "A src/apm_cli/auth/resolver.py", + "M src/apm_cli/cli.py", + "M src/apm_cli/integration/git.py", + "M tests/unit/auth/test_resolver.py", + "M CHANGELOG.md" + ], + "commits": [ + "feat(auth): introduce AuthResolver as single source of token resolution", + "refactor(cli): replace per-host PAT lookup with AuthResolver.get(host)", + "fix(integration/git): drop legacy GITHUB_APM_PAT fallback path", + "docs(changelog): note breaking removal of --token-source flag" + ], + "linked_issue": "#812 -- token resolution scattered across modules; intermittent EMU lookups failing", + "validation_evidence": "uv run pytest tests/unit/auth -x -> 142 passed in 3.4s\napm audit --ci -> 0 findings" + }, + "rubric": [ + {"id": "tldr-present", "pattern": "(?im)^\\s*##?\\s*TL;DR\\b", "weight": 1, "description": "TL;DR section header present"}, + {"id": "tldr-short", "pattern": "(?is)##?\\s*TL;DR.{1,800}?\\n##?\\s", "weight": 1, "description": "TL;DR is short (under 800 chars before next 
H2)"}, + {"id": "problem-section", "pattern": "(?im)^\\s*##?\\s*Problem\\b", "weight": 1, "description": "Problem (WHY) section present"}, + {"id": "anchored-quote", "pattern": "\\[\"[^\"]{8,}\"\\]\\(https?://", "weight": 2, "description": "At least one verbatim quote anchored to a URL"}, + {"id": "approach-or-implementation", "pattern": "(?im)^\\s*##?\\s*(Approach|Implementation)\\b", "weight": 1, "description": "Approach or Implementation section present"}, + {"id": "mermaid-block", "pattern": "(?s)```mermaid[\\s\\S]+?```", "weight": 2, "description": "At least one mermaid diagram block"}, + {"id": "tradeoffs-section", "pattern": "(?im)^\\s*##?\\s*Trade-?offs?\\b", "weight": 1, "description": "Trade-offs section present"}, + {"id": "validation-section", "pattern": "(?im)^\\s*##?\\s*Validation\\b", "weight": 1, "description": "Validation section present"}, + {"id": "validation-real-output", "pattern": "(?im)pytest|apm audit|uv run", "weight": 1, "description": "Validation shows real command output"}, + {"id": "how-to-test", "pattern": "(?im)^\\s*##?\\s*How to test\\b", "weight": 1, "description": "How to test section present"}, + {"id": "trailer", "pattern": "Co-authored-by: Copilot <223556219", "weight": 1, "description": "Copilot co-author trailer present"}, + {"id": "no-marketing-tone", "pattern": "(?i)significantly enhances|best-in-class|game-changing|revolutionary", "weight": -2, "description": "PENALTY: marketing tone detected"} + ], + "fixtures": { + "with_skill": "../fixtures/auth-refactor__with_skill.md", + "without_skill": "../fixtures/auth-refactor__without_skill.md" + } +} diff --git a/.apm/skills/pr-description-skill/evals/content/dep-bump.json b/.apm/skills/pr-description-skill/evals/content/dep-bump.json new file mode 100644 index 000000000..1deee05a1 --- /dev/null +++ b/.apm/skills/pr-description-skill/evals/content/dep-bump.json @@ -0,0 +1,37 @@ +{ + "schema_version": 1, + "id": "dep-bump", + "summary": "Mechanical dependency upgrade: bump click 
from 8.1.7 to 8.2.1; lockfile-only changes plus a 1-line type annotation tweak. Tests Trade-offs may be 1-2 bullets and mermaid is optional.", + "scenario": { + "branch": "deps/click-8.2.1", + "base": "main", + "files_changed": [ + "M pyproject.toml", + "M uv.lock", + "M src/apm_cli/cli.py" + ], + "commits": [ + "chore(deps): bump click 8.1.7 -> 8.2.1", + "chore(cli): adjust type hint for new click.Context generic" + ], + "linked_issue": "(none)", + "validation_evidence": "uv sync --extra dev -> resolved in 2.1s\nuv run pytest tests/unit -x -> 2418 passed in 38.7s" + }, + "rubric": [ + {"id": "tldr-present", "pattern": "(?im)^\\s*##?\\s*TL;DR\\b", "weight": 1, "description": "TL;DR present"}, + {"id": "tldr-short", "pattern": "(?is)##?\\s*TL;DR.{1,500}?\\n##?\\s", "weight": 1, "description": "TL;DR very short on a mechanical PR"}, + {"id": "problem-section", "pattern": "(?im)^\\s*##?\\s*Problem\\b", "weight": 1, "description": "Problem section present"}, + {"id": "implementation-section", "pattern": "(?im)^\\s*##?\\s*Implementation\\b", "weight": 1, "description": "Implementation section present"}, + {"id": "tradeoffs-section", "pattern": "(?im)^\\s*##?\\s*Trade-?offs?\\b", "weight": 1, "description": "Trade-offs section present (may be 1-2 bullets)"}, + {"id": "validation-section", "pattern": "(?im)^\\s*##?\\s*Validation\\b", "weight": 1, "description": "Validation section present"}, + {"id": "validation-real-output", "pattern": "(?im)pytest|uv sync|2418 passed", "weight": 1, "description": "Real validation output (pytest counts) shown"}, + {"id": "how-to-test", "pattern": "(?im)^\\s*##?\\s*How to test\\b", "weight": 1, "description": "How to test section present"}, + {"id": "trailer", "pattern": "Co-authored-by: Copilot <223556219", "weight": 1, "description": "Copilot co-author trailer present"}, + {"id": "no-marketing-tone", "pattern": "(?i)significantly enhances|best-in-class|game-changing", "weight": -2, "description": "PENALTY: marketing tone detected"}, + 
{"id": "no-diff-restate", "pattern": "(?im)^[+-]{3} ", "weight": -1, "description": "PENALTY: diff lines restated in body"} + ], + "fixtures": { + "with_skill": "../fixtures/dep-bump__with_skill.md", + "without_skill": "../fixtures/dep-bump__without_skill.md" + } +} diff --git a/.apm/skills/pr-description-skill/evals/content/docs-only.json b/.apm/skills/pr-description-skill/evals/content/docs-only.json new file mode 100644 index 000000000..c3bc93c08 --- /dev/null +++ b/.apm/skills/pr-description-skill/evals/content/docs-only.json @@ -0,0 +1,34 @@ +{ + "schema_version": 1, + "id": "docs-only", + "summary": "Markdown-only PR adding a quickstart guide. The skill must keep TL;DR, Problem, Validation, How-to-test sections even though the PR is 'trivial' (per SKILL.md gotcha).", + "scenario": { + "branch": "docs/quickstart", + "base": "main", + "files_changed": [ + "A docs/src/content/docs/guides/quickstart.md", + "M docs/src/content/docs/index.md" + ], + "commits": [ + "docs(quickstart): add 5-minute getting-started guide", + "docs(index): link quickstart from landing page" + ], + "linked_issue": "#640 -- new contributors bounce after install; need a 5-minute happy path", + "validation_evidence": "npm run --prefix docs build -> built in 4.2s, no broken links\napm audit --ci -> 0 findings" + }, + "rubric": [ + {"id": "tldr-present", "pattern": "(?im)^\\s*##?\\s*TL;DR\\b", "weight": 1, "description": "TL;DR present even on docs-only PR"}, + {"id": "problem-section", "pattern": "(?im)^\\s*##?\\s*Problem\\b", "weight": 1, "description": "Problem section present even on docs-only PR (per SKILL.md gotcha)"}, + {"id": "anchored-quote", "pattern": "\\[\"[^\"]{8,}\"\\]\\(https?://", "weight": 2, "description": "At least one verbatim anchored quote"}, + {"id": "validation-section", "pattern": "(?im)^\\s*##?\\s*Validation\\b", "weight": 1, "description": "Validation section present"}, + {"id": "validation-build", "pattern": "(?im)npm run|build|broken link", "weight": 1, 
"description": "Real validation evidence (build/link check) shown"}, + {"id": "how-to-test", "pattern": "(?im)^\\s*##?\\s*How to test\\b", "weight": 1, "description": "How to test present"}, + {"id": "trailer", "pattern": "Co-authored-by: Copilot <223556219", "weight": 1, "description": "Copilot co-author trailer present"}, + {"id": "no-trivial-skip", "pattern": "(?i)the PR is trivial|too small to need", "weight": -2, "description": "PENALTY: skipping sections because PR is small"}, + {"id": "no-marketing-tone", "pattern": "(?i)significantly enhances|best-in-class|game-changing", "weight": -2, "description": "PENALTY: marketing tone detected"} + ], + "fixtures": { + "with_skill": "../fixtures/docs-only__with_skill.md", + "without_skill": "../fixtures/docs-only__without_skill.md" + } +} diff --git a/.apm/skills/pr-description-skill/evals/evals.json b/.apm/skills/pr-description-skill/evals/evals.json new file mode 100644 index 000000000..f7a8750ea --- /dev/null +++ b/.apm/skills/pr-description-skill/evals/evals.json @@ -0,0 +1,62 @@ +{ + "schema_version": 1, + "skill": "pr-description-skill", + "skill_path": "../SKILL.md", + "triggers_manifest": "triggers.json", + "content_manifests": [ + "content/auth-refactor.json", + "content/docs-only.json", + "content/dep-bump.json" + ], + "gates": { + "triggers": { + "split": "val", + "should_fire_rate_min": 0.5, + "should_not_fire_rate_max": 0.5 + }, + "content": { + "delta_min_anchors": 1, + "scope": "every_scenario" + } + }, + "stop_list": [ + "commit message", + "release note", + "release notes", + "changelog", + "open an issue", + "open issue", + "review this pr", + "code review comment", + "design doc", + "summarize the diff" + ], + "trigger_keywords_primary": [ + "pr description", + "pr body", + "pull request description", + "pull request body", + "pr template", + "pr write-up", + "pr writeup", + "draft the pr", + "draft a pr", + "open the pr", + "open a pr", + "summarize this branch as a pr", + "fill in the pr template" 
+ ], + "trigger_keywords_secondary": [ + "pr", + "pull", + "request", + "description", + "body", + "draft", + "open", + "write", + "fill", + "summarize", + "branch" + ] +} diff --git a/.apm/skills/pr-description-skill/evals/fixtures/auth-refactor__with_skill.md b/.apm/skills/pr-description-skill/evals/fixtures/auth-refactor__with_skill.md new file mode 100644 index 000000000..0078a04a9 --- /dev/null +++ b/.apm/skills/pr-description-skill/evals/fixtures/auth-refactor__with_skill.md @@ -0,0 +1,105 @@ +refactor(auth): centralize token resolution behind AuthResolver + +## TL;DR + +Token resolution was scattered across `cli.py`, `integration/git.py`, +and ad-hoc `os.environ` lookups, producing intermittent EMU failures +and a `--token-source` flag whose semantics no two callers agreed +on. This PR introduces a single `AuthResolver` and removes the +flag. One breaking change (flag removal) is documented in the +CHANGELOG. + +## Problem (WHY) + +- Three modules each implemented their own host-to-token mapping, + diverging on EMU host suffixes (`#812`). +- The `--token-source` CLI flag let callers override resolution + per-invocation but was respected only by `cli.py`, silently + ignored by the git integration layer. +- This produced support reports of "auth works in `apm install` + but fails in `apm compile`" for the same machine. + +The skill's PROSE foundation calls this out directly: +["Grounding outputs in deterministic tool execution transforms probabilistic generation into verifiable action."](https://danielmeppiel.github.io/awesome-ai-native/docs/prose/) +-- token resolution that branches by call site is the opposite of +deterministic. + +## Approach (WHAT) + +A single `AuthResolver` owns the host-to-token mapping. Callers +ask `AuthResolver.get(host)` and get a token or a typed error. +The `--token-source` flag is removed; the resolver consults a +fixed precedence (env -> keyring -> gh auth) that is identical +for every caller. 
+ +## Implementation (HOW) + +| File | Change | +|---|---| +| `src/apm_cli/auth/resolver.py` | NEW. `AuthResolver` class with `get(host)`, fixed precedence chain. | +| `src/apm_cli/auth/__init__.py` | Re-export `AuthResolver`. | +| `src/apm_cli/cli.py` | Drop `--token-source`; route all auth lookups through `AuthResolver`. | +| `src/apm_cli/integration/git.py` | Drop legacy `GITHUB_APM_PAT` fallback; delegate to resolver. | +| `tests/unit/auth/test_resolver.py` | New unit tests for resolver precedence and EMU host parsing. | +| `CHANGELOG.md` | Note `--token-source` removal under Breaking Changes. | + +## Diagram + +The resolver becomes the only edge from callers to the token +sources; the legacy fan-out is collapsed. + +```mermaid +flowchart LR + CLI["apm cli command"] --> R["AuthResolver"] + GIT["git integration"] --> R + R --> ENV["environment variables"] + R --> KR["OS keyring"] + R --> GH["gh auth status"] +``` + +## Trade-offs + +- **Removed `--token-source`**: minor break for power users who + scripted around the flag; mitigated by the env-var path which + covers every documented use case. +- **Single precedence chain**: any future per-host policy (e.g. + ADO_APM_PAT for Azure DevOps) plugs in as a new resolver + source, not a new caller-side branch. +- **Rejected**: keeping the flag and routing it through the + resolver. That preserves the bug surface (callers still + passing different flag values) without adding value. + +## Benefits + +1. One source of truth for auth (closes #812). +2. EMU host suffixes parsed in one place. +3. Test surface shrinks from 3 modules to 1. +4. Removes 84 lines of duplicated env-var lookup code. + +## Validation + +
Full test output + +``` +$ uv run pytest tests/unit/auth -x +============================= test session starts ============================= +collected 142 items +tests/unit/auth/test_resolver.py ............................................ +142 passed in 3.4s + +$ apm audit --ci +0 findings +``` + +
+ +## How to test + +- [ ] `uv sync --extra dev` +- [ ] `uv run pytest tests/unit/auth -x` (expect 142 passing) +- [ ] `apm install microsoft/apm/packages/apm-guide` against an + EMU host; confirm the install completes without prompting. +- [ ] Try `apm install --token-source=env`; expect a clear + "unknown option" error (flag is gone). + +Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> diff --git a/.apm/skills/pr-description-skill/evals/fixtures/auth-refactor__without_skill.md b/.apm/skills/pr-description-skill/evals/fixtures/auth-refactor__without_skill.md new file mode 100644 index 000000000..10b6d2119 --- /dev/null +++ b/.apm/skills/pr-description-skill/evals/fixtures/auth-refactor__without_skill.md @@ -0,0 +1,32 @@ +# Auth refactor + +This PR refactors authentication. It centralizes token resolution +into a new AuthResolver class and removes the --token-source flag. + +## What changed + +- Added a new file src/apm_cli/auth/resolver.py with the + AuthResolver class. +- Updated src/apm_cli/auth/__init__.py to export AuthResolver. +- Updated src/apm_cli/cli.py to use AuthResolver for all token + lookups and removed the --token-source flag. +- Updated src/apm_cli/integration/git.py to delegate to + AuthResolver and dropped the GITHUB_APM_PAT fallback. +- Added unit tests in tests/unit/auth/test_resolver.py. +- Updated CHANGELOG.md. + +## Why + +The previous code had token resolution logic scattered across +multiple modules. This was fragile and led to bugs. Centralizing +it makes the code cleaner and easier to maintain. This is a +significantly enhanced approach to authentication. + +## Testing + +I ran the tests and they passed. The audit also passed. + +## Notes + +This is a breaking change because the --token-source flag is +removed. 
diff --git a/.apm/skills/pr-description-skill/evals/fixtures/dep-bump__with_skill.md b/.apm/skills/pr-description-skill/evals/fixtures/dep-bump__with_skill.md new file mode 100644 index 000000000..06d48f629 --- /dev/null +++ b/.apm/skills/pr-description-skill/evals/fixtures/dep-bump__with_skill.md @@ -0,0 +1,52 @@ +chore(deps): bump click 8.1.7 -> 8.2.1 + +## TL;DR + +Routine click upgrade. Picks up the new `Context` generic typing +and a fix for boolean-flag short forms. One type hint adjusted in +`cli.py`. Lockfile regenerated; full unit suite green. + +## Problem (WHY) + +- click 8.2.x ships a typing fix flagged by mypy on Python 3.12. +- Staying on 8.1.7 means our type checks fail under the supported + Python matrix. + +The skill's anchor for cite-or-omit applies here too: +["Add what the agent lacks, omit what it knows"](https://agentskills.io/skill-creation/best-practices) +-- this PR body says only what the diff alone cannot. + +## Approach (WHAT) + +Mechanical. Bump version pin, regenerate lockfile, adjust one +type hint. + +## Implementation (HOW) + +| File | Change | +|---|---| +| `pyproject.toml` | `click = "^8.2.1"` (was `^8.1.7`). | +| `uv.lock` | Regenerated by `uv sync`. | +| `src/apm_cli/cli.py` | One type hint updated for new `click.Context` generic. | + +## Trade-offs + +- **Direct bump vs caret-locking to 8.1**: chose direct bump + because the typing fix is the reason for the upgrade. 
+ +## Validation + +``` +$ uv sync --extra dev +resolved in 2.1s +$ uv run pytest tests/unit -x +2418 passed in 38.7s +``` + +## How to test + +- [ ] `uv sync --extra dev` +- [ ] `uv run pytest tests/unit -x` (expect 2418 passing) +- [ ] `uv run mypy src/apm_cli` (expect 0 errors) + +Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> diff --git a/.apm/skills/pr-description-skill/evals/fixtures/dep-bump__without_skill.md b/.apm/skills/pr-description-skill/evals/fixtures/dep-bump__without_skill.md new file mode 100644 index 000000000..21ccd2594 --- /dev/null +++ b/.apm/skills/pr-description-skill/evals/fixtures/dep-bump__without_skill.md @@ -0,0 +1,13 @@ +# Bump click + +This bumps click from 8.1.7 to 8.2.1. + +I also adjusted a type hint in cli.py because click.Context is +now generic. + +The lockfile was regenerated. + +This is a game-changing dependency upgrade for our type-checking +story. + +Tests pass. diff --git a/.apm/skills/pr-description-skill/evals/fixtures/docs-only__with_skill.md b/.apm/skills/pr-description-skill/evals/fixtures/docs-only__with_skill.md new file mode 100644 index 000000000..382f5470b --- /dev/null +++ b/.apm/skills/pr-description-skill/evals/fixtures/docs-only__with_skill.md @@ -0,0 +1,63 @@ +docs(quickstart): add 5-minute getting-started guide + +## TL;DR + +New contributors land on the docs index, scroll, and bounce. +This PR adds a 5-minute quickstart linked from the landing page +so first-touch users have a happy path before reading reference +material. + +## Problem (WHY) + +- Issue #640 reports a 60% bounce rate on `/docs/` for new + visitors. +- The current landing page lists every guide in alphabetical + order; "Quickstart" did not exist. +- Agent Skills makes the diagnosis explicit: + ["agents pattern-match well against concrete structures"](https://agentskills.io/skill-creation/best-practices) + -- and the same is true for human readers landing cold. + +## Approach (WHAT) + +Additive: see Implementation. 
One new page, one link added on +the index. + +## Implementation (HOW) + +| File | Change | +|---|---| +| `docs/src/content/docs/guides/quickstart.md` | NEW. 5-minute happy path: install, init, run. | +| `docs/src/content/docs/index.md` | Promote quickstart link above the alphabetical guide list. | + +## Trade-offs + +- **Promoting one guide above the list**: minor visual + asymmetry; pays for itself if bounce rate drops. +- **Rejected**: rewriting the landing page. Higher risk, larger + diff, harder to A/B. + +## Benefits + +1. Closes #640 if bounce rate drops measurably. +2. New contributor first-touch falls under 5 minutes. +3. Zero code change; rollback is `git revert`. + +## Validation + +``` +$ npm run --prefix docs build +built in 4.2s +no broken links +$ apm audit --ci +0 findings +``` + +## How to test + +- [ ] `npm run --prefix docs dev` +- [ ] Open `http://localhost:4321/docs/`; confirm Quickstart + link is above the fold. +- [ ] Click through; confirm the 5 steps complete on a fresh + clone in under 5 minutes. + +Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> diff --git a/.apm/skills/pr-description-skill/evals/fixtures/docs-only__without_skill.md b/.apm/skills/pr-description-skill/evals/fixtures/docs-only__without_skill.md new file mode 100644 index 000000000..5dddbc2fc --- /dev/null +++ b/.apm/skills/pr-description-skill/evals/fixtures/docs-only__without_skill.md @@ -0,0 +1,14 @@ +# Add quickstart guide + +Adds a quickstart guide to the docs. + +## Changes + +- Added docs/src/content/docs/guides/quickstart.md +- Updated docs/src/content/docs/index.md to link to it + +The PR is trivial -- just docs. No code changes. + +## Build + +The docs build passes. 
diff --git a/.apm/skills/pr-description-skill/evals/results/.gitkeep b/.apm/skills/pr-description-skill/evals/results/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.apm/skills/pr-description-skill/evals/triggers.json b/.apm/skills/pr-description-skill/evals/triggers.json new file mode 100644 index 000000000..c5baa891c --- /dev/null +++ b/.apm/skills/pr-description-skill/evals/triggers.json @@ -0,0 +1,131 @@ +{ + "schema_version": 1, + "items": [ + { + "id": "fire-01", + "query": "write a PR description for me", + "expected": "fire", + "split": "train", + "notes": "Verbatim phrase from SKILL.md description." + }, + { + "id": "fire-02", + "query": "draft the PR body", + "expected": "fire", + "split": "train", + "notes": "Verbatim phrase from SKILL.md description." + }, + { + "id": "fire-03", + "query": "open this PR -- can you fill in the template?", + "expected": "fire", + "split": "train", + "notes": "Combined open-PR + fill-template trigger." + }, + { + "id": "fire-04", + "query": "create the PR write-up please", + "expected": "fire", + "split": "train", + "notes": "Verbatim phrase from SKILL.md trigger list." + }, + { + "id": "fire-05", + "query": "summarize this branch as a PR", + "expected": "fire", + "split": "train", + "notes": "Verbatim phrase from SKILL.md trigger list." + }, + { + "id": "fire-06", + "query": "let's open the PR -- write the body", + "expected": "fire", + "split": "val", + "notes": "Open-PR + body, near-domain paraphrase." + }, + { + "id": "fire-07", + "query": "draft a pull request description for this branch", + "expected": "fire", + "split": "val", + "notes": "Long form of 'PR description'." + }, + { + "id": "fire-08", + "query": "pls write the pull request body", + "expected": "fire", + "split": "val", + "notes": "Casual tone, long form." 
+ }, + { + "id": "fire-09", + "query": "fill in the PR template before we ship", + "expected": "fire", + "split": "val", + "notes": "Verbatim phrase wrapped in a longer sentence." + }, + { + "id": "no-fire-01", + "query": "write a commit message for these changes", + "expected": "no_fire", + "split": "train", + "notes": "Adjacent intent (commit, not PR) -- common confusion." + }, + { + "id": "no-fire-02", + "query": "summarize this branch as a release note", + "expected": "no_fire", + "split": "train", + "notes": "Adjacent intent (release notes, not PR body)." + }, + { + "id": "no-fire-03", + "query": "draft a CHANGELOG entry for this work", + "expected": "no_fire", + "split": "train", + "notes": "Changelog work, not PR description." + }, + { + "id": "no-fire-04", + "query": "open an issue describing this bug", + "expected": "no_fire", + "split": "train", + "notes": "Issue, not PR." + }, + { + "id": "no-fire-05", + "query": "review this PR for me", + "expected": "no_fire", + "split": "train", + "notes": "Reviewing != describing." + }, + { + "id": "no-fire-06", + "query": "summarize the diff for me", + "expected": "no_fire", + "split": "val", + "notes": "Diff summary is an INPUT to PR description skill, not the same task." + }, + { + "id": "no-fire-07", + "query": "draft a code review comment", + "expected": "no_fire", + "split": "val", + "notes": "Review comment, not PR body." + }, + { + "id": "no-fire-08", + "query": "write the design doc for this branch", + "expected": "no_fire", + "split": "val", + "notes": "Design doc lives upstream of PR; different artifact." + }, + { + "id": "no-fire-09", + "query": "give me release notes for this milestone", + "expected": "no_fire", + "split": "val", + "notes": "Release notes, not PR body." 
+ } + ] +} diff --git a/.apm/skills/pr-description-skill/scripts/run_evals.py b/.apm/skills/pr-description-skill/scripts/run_evals.py new file mode 100644 index 000000000..c5056aaf1 --- /dev/null +++ b/.apm/skills/pr-description-skill/scripts/run_evals.py @@ -0,0 +1,343 @@ +#!/usr/bin/env python3 +# ASCII-only. Runs the pr-description-skill evals suite. +"""Run pr-description-skill evals. + +Two eval families are exercised: + + * TRIGGER EVALS scoring the SKILL.md `description:` against + should-fire and should-not-fire queries via a deterministic + keyword/bigram matcher. The matcher is documented in the + README; it approximates dispatcher behavior without requiring + a live LLM. + + * CONTENT EVALS scoring pre-recorded `with_skill` and + `without_skill` fixtures against per-scenario regex rubrics. + The delta between the two scores is reported per scenario. + +The runner is non-interactive, stdlib-only, and emits structured +JSON on stdout (machine-readable summary) plus diagnostics on +stderr. Exit codes: + + 0 = all gates met + 1 = one or more gates failed + 2 = runner error (manifest or fixture missing, parse error) + +Run from the worktree root: + + python .apm/skills/pr-description-skill/scripts/run_evals.py + +Use --help for full options. 
+""" +from __future__ import annotations + +import argparse +import datetime as _dt +import json +import re +import sys +from pathlib import Path +from typing import Any + +SCHEMA_VERSION = 1 +SKILL_DIR = Path(__file__).resolve().parent.parent +EVALS_DIR = SKILL_DIR / "evals" + + +def _log(msg: str) -> None: + print(msg, file=sys.stderr) + + +def _load_json(path: Path) -> dict[str, Any]: + try: + return json.loads(path.read_text(encoding="utf-8")) + except FileNotFoundError: + _log(f"[x] missing file: {path}") + sys.exit(2) + except json.JSONDecodeError as exc: + _log(f"[x] invalid JSON in {path}: {exc}") + sys.exit(2) + + +# --------------------------------------------------------------------------- +# trigger evals +# --------------------------------------------------------------------------- + +def _normalize(text: str) -> str: + return re.sub(r"\s+", " ", text.lower()).strip() + + +def score_trigger(query: str, manifest: dict[str, Any]) -> tuple[bool, dict[str, Any]]: + """Return (predicted_fire, diagnostic) for one query. + + Rule (deterministic dispatcher approximation): + 1. Lowercase + collapse whitespace. + 2. If any phrase from `stop_list` appears verbatim, predict + no_fire (negative override beats everything else). + 3. If any phrase from `trigger_keywords_primary` appears + verbatim, predict fire. + 4. Otherwise count distinct `trigger_keywords_secondary` + tokens present; predict fire iff the count is >= 3 AND + at least one of {pr, pull} is present. 
+ """ + q = _normalize(query) + + for stop in manifest["stop_list"]: + if stop in q: + return False, {"reason": "stop_list_hit", "match": stop} + + for kw in manifest["trigger_keywords_primary"]: + if kw in q: + return True, {"reason": "primary_match", "match": kw} + + sec = manifest["trigger_keywords_secondary"] + hits = sorted({k for k in sec if re.search(rf"\b{re.escape(k)}\b", q)}) + has_pr = any(t in hits for t in ("pr", "pull")) + if has_pr and len(hits) >= 3: + return True, {"reason": "secondary_threshold", "hits": hits} + + return False, {"reason": "no_match", "hits": hits} + + +def run_trigger_evals( + manifest: dict[str, Any], split: str +) -> dict[str, Any]: + triggers_path = EVALS_DIR / manifest["triggers_manifest"] + triggers = _load_json(triggers_path)["items"] + + if split != "all": + triggers = [t for t in triggers if t["split"] == split] + + rows: list[dict[str, Any]] = [] + fire_total = fire_correct = 0 + no_fire_total = no_fire_correct = 0 + + for item in triggers: + predicted_fire, diag = score_trigger(item["query"], manifest) + expected_fire = item["expected"] == "fire" + passed = predicted_fire == expected_fire + + rows.append({ + "id": item["id"], + "split": item["split"], + "query": item["query"], + "expected": item["expected"], + "predicted": "fire" if predicted_fire else "no_fire", + "passed": passed, + "diagnostic": diag, + }) + + if expected_fire: + fire_total += 1 + if passed: + fire_correct += 1 + else: + no_fire_total += 1 + if passed: + no_fire_correct += 1 + + fire_rate = (fire_correct / fire_total) if fire_total else 0.0 + no_fire_rate = (no_fire_correct / no_fire_total) if no_fire_total else 0.0 + + gates = manifest["gates"]["triggers"] + fire_gate_met = fire_rate >= gates["should_fire_rate_min"] + no_fire_gate_met = ( + (no_fire_rate) >= (1.0 - gates["should_not_fire_rate_max"]) + ) + + return { + "split": split, + "should_fire_correct_rate": fire_rate, + "should_not_fire_correct_rate": no_fire_rate, + "should_fire_gate": { + 
"min": gates["should_fire_rate_min"], + "met": fire_gate_met, + }, + "should_not_fire_gate": { + "max_miss_rate": gates["should_not_fire_rate_max"], + "met": no_fire_gate_met, + }, + "rows": rows, + "passed": fire_gate_met and no_fire_gate_met, + } + + +# --------------------------------------------------------------------------- +# content evals +# --------------------------------------------------------------------------- + +def score_fixture(text: str, rubric: list[dict[str, Any]]) -> dict[str, Any]: + score = 0 + anchors_hit: list[str] = [] + anchors_missed: list[str] = [] + for check in rubric: + pattern = re.compile(check["pattern"]) + match = bool(pattern.search(text)) + weight = int(check["weight"]) + if match: + score += weight + anchors_hit.append(check["id"]) + else: + if weight < 0: + # penalty pattern: not matching is GOOD, no score change + pass + else: + anchors_missed.append(check["id"]) + return {"score": score, "anchors_hit": anchors_hit, "anchors_missed": anchors_missed} + + +def run_content_eval(scenario_path: Path, manifest_root: Path) -> dict[str, Any]: + scenario = _load_json(scenario_path) + + fixtures = scenario["fixtures"] + with_path = (scenario_path.parent / fixtures["with_skill"]).resolve() + without_path = (scenario_path.parent / fixtures["without_skill"]).resolve() + + if not with_path.exists(): + _log(f"[x] missing fixture: {with_path}") + sys.exit(2) + if not without_path.exists(): + _log(f"[x] missing fixture: {without_path}") + sys.exit(2) + + with_text = with_path.read_text(encoding="utf-8") + without_text = without_path.read_text(encoding="utf-8") + + with_score = score_fixture(with_text, scenario["rubric"]) + without_score = score_fixture(without_text, scenario["rubric"]) + + delta_score = with_score["score"] - without_score["score"] + delta_anchors = len( + set(with_score["anchors_hit"]) - set(without_score["anchors_hit"]) + ) + + return { + "id": scenario["id"], + "summary": scenario["summary"], + "with_skill": with_score, + 
"without_skill": without_score, + "delta_score": delta_score, + "delta_anchors": delta_anchors, + } + + +def run_content_evals(manifest: dict[str, Any]) -> dict[str, Any]: + results: list[dict[str, Any]] = [] + delta_min = int(manifest["gates"]["content"]["delta_min_anchors"]) + all_passed = True + + for rel in manifest["content_manifests"]: + scenario_path = EVALS_DIR / rel + result = run_content_eval(scenario_path, EVALS_DIR) + result["passed"] = result["delta_anchors"] >= delta_min + if not result["passed"]: + all_passed = False + results.append(result) + + return { + "delta_min_anchors": delta_min, + "scenarios": results, + "passed": all_passed, + } + + +# --------------------------------------------------------------------------- +# orchestration +# --------------------------------------------------------------------------- + +def build_parser() -> argparse.ArgumentParser: + p = argparse.ArgumentParser( + prog="run_evals.py", + description="Run pr-description-skill evals (triggers + content).", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + p.add_argument( + "--filter", + choices=("all", "triggers", "content"), + default="all", + help="Eval family to run (default: all).", + ) + p.add_argument( + "--split", + choices=("all", "train", "val"), + default="val", + help="Trigger-eval split to score for the gate (default: val, the ship gate).", + ) + p.add_argument( + "--manifest", + default=str(EVALS_DIR / "evals.json"), + help="Path to evals manifest (default: /evals/evals.json).", + ) + p.add_argument( + "--results-dir", + default=str(EVALS_DIR / "results"), + help="Directory to write timestamped result JSON.", + ) + p.add_argument( + "--quiet", + action="store_true", + help="Suppress stderr diagnostics.", + ) + p.add_argument( + "--no-write", + action="store_true", + help="Do not write a result file to --results-dir.", + ) + return p + + +def main(argv: list[str] | None = None) -> int: + args = build_parser().parse_args(argv) + + if args.quiet: + 
global _log + _log = lambda msg: None # noqa: E731 + + manifest_path = Path(args.manifest) + manifest = _load_json(manifest_path) + if manifest.get("schema_version") != SCHEMA_VERSION: + _log( + f"[!] schema_version mismatch: manifest={manifest.get('schema_version')} " + f"runner={SCHEMA_VERSION}" + ) + + summary: dict[str, Any] = { + "schema_version": SCHEMA_VERSION, + "skill": manifest.get("skill"), + "timestamp_utc": _dt.datetime.now(tz=_dt.timezone.utc).isoformat(), + "filter": args.filter, + "split": args.split, + } + + overall_passed = True + + if args.filter in ("all", "triggers"): + _log("[>] running trigger evals") + trig = run_trigger_evals(manifest, args.split) + summary["triggers"] = trig + if not trig["passed"]: + overall_passed = False + + if args.filter in ("all", "content"): + _log("[>] running content evals") + cont = run_content_evals(manifest) + summary["content"] = cont + if not cont["passed"]: + overall_passed = False + + summary["passed"] = overall_passed + + if not args.no_write: + results_dir = Path(args.results_dir) + results_dir.mkdir(parents=True, exist_ok=True) + ts = summary["timestamp_utc"].replace(":", "-").replace("+00-00", "Z") + out_path = results_dir / f"{ts}.json" + out_path.write_text(json.dumps(summary, indent=2) + "\n", encoding="utf-8") + summary["result_file"] = str(out_path.relative_to(SKILL_DIR.parent.parent.parent)) + _log(f"[+] wrote {out_path}") + + print(json.dumps(summary, indent=2)) + return 0 if overall_passed else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/.github/skills/pr-description-skill/evals/.gitignore b/.github/skills/pr-description-skill/evals/.gitignore new file mode 100644 index 000000000..62279c73a --- /dev/null +++ b/.github/skills/pr-description-skill/evals/.gitignore @@ -0,0 +1,2 @@ +# Ignore generated result files; track only the .gitkeep sentinel. 
+results/*.json diff --git a/.github/skills/pr-description-skill/evals/README.md b/.github/skills/pr-description-skill/evals/README.md new file mode 100644 index 000000000..5860c8d45 --- /dev/null +++ b/.github/skills/pr-description-skill/evals/README.md @@ -0,0 +1,132 @@ +# pr-description-skill evals + +This bundle answers two questions deterministically and without +requiring an LLM API key: + +1. **TRIGGER EVALS**: does the SKILL.md `description:` reliably + match real should-fire intents and avoid near-miss queries? +2. **CONTENT EVALS**: does loading the SKILL.md body change the + shape of a PR description an agent produces, vs not loading + it? + +## Layout + +``` +evals/ + evals.json # top-level manifest (gates, keyword lists) + triggers.json # 18 trigger items (9 fire / 9 no-fire), + # ~60/40 train/val split + content/ + auth-refactor.json # cross-cutting refactor scenario + rubric + docs-only.json # docs-only PR scenario + rubric + dep-bump.json # mechanical dep bump scenario + rubric + fixtures/ + <scenario>__with_skill.md # representative output produced + # under the SKILL.md guidance + <scenario>__without_skill.md # representative output produced + # without the skill loaded + results/ + .gitkeep # tracked sentinel + <timestamp>.json # one file per run (gitignored) + README.md # this file +``` + +The runner script lives at +`.apm/skills/pr-description-skill/scripts/run_evals.py`. + +## Run + +From the repo root (or this skill's directory): + +``` +python .apm/skills/pr-description-skill/scripts/run_evals.py +``` + +Common options: + +| Flag | Effect | +|---|---| +| `--filter triggers` | Run only the trigger evals. | +| `--filter content` | Run only the content evals. | +| `--split train` | Score the train split for triggers (default is `val`, the ship gate). | +| `--split all` | Score both splits. | +| `--no-write` | Do not write to `evals/results/`. | +| `--quiet` | Suppress stderr diagnostics. 
| + +Exit codes: + +* `0` = all gates met +* `1` = one or more gates failed +* `2` = runner error (missing manifest, parse error, missing fixture) + +## Trigger eval scoring + +The runner uses a deterministic dispatcher approximation defined in +`evals.json`: + +1. If any phrase from `stop_list` appears verbatim in the query + (lowercased), predict `no_fire`. +2. Otherwise if any phrase from `trigger_keywords_primary` appears + verbatim, predict `fire`. +3. Otherwise count distinct tokens from + `trigger_keywords_secondary`; predict `fire` iff at least 3 + distinct tokens AND one of `{pr, pull}` is present. + +This is NOT a perfect proxy for an actual LLM dispatcher. It is +fast, deterministic, and CI-friendly. When `gh models` becomes +available, an `--llm` flag can be added that calls the real +dispatcher; the manifest schema already accommodates it. + +Ship gate (validation split): + +* should-fire correctness >= 0.5 +* should-not-fire correctness >= 0.5 (i.e. less than 50% of + near-miss queries leak through as `fire`) + +## Content eval scoring + +Each scenario ships two fixtures: one representing the agent's +output WITH the skill loaded, one representing the same task +WITHOUT it. The same regex rubric scores both. The reported +`delta_anchors` is the count of rubric anchors that fire on +`with_skill` but not on `without_skill`. + +Ship gate: every scenario has `delta_anchors >= 1`. A scenario +with zero delta is a signal that the skill is not adding +measurable value on that shape; the genesis discipline says +redesign or delete. (This runner only flags; deletion is a +human decision.) + +### Fixtures and the LLM-in-the-loop question + +The fixtures are pre-recorded representative outputs. They are +NOT regenerated by an LLM at run time. Two reasons: + +1. **Determinism**: re-running the suite must produce identical + results so regressions are visible. +2. **No required keys**: contributors should be able to run the + suite with only Python stdlib. 
+ +When a maintainer materially changes SKILL.md, the fixtures +SHOULD be regenerated by hand (or by running the agent once on +each scenario, with and without the skill loaded, and pasting +the outputs in). Document the regeneration in the PR that +changes SKILL.md. + +## Adding a new eval + +* **New trigger eval**: append an entry to `triggers.json` with + a fresh `id`, the `query`, the `expected` outcome, and the + `split`. Keep the 60/40 ratio roughly intact. +* **New content scenario**: create `content/.json` (mirror + one of the existing files), add `fixtures/__with_skill.md` + and `fixtures/__without_skill.md`, and append the path to + `content_manifests` in `evals.json`. + +## Encoding + +All eval source files (Python, JSON, README) stay within +printable ASCII per the repo-wide encoding rule. Fixtures are +markdown intended to model PR-body output and MAY contain +Unicode, mirroring the SKILL.md "Output charset rule" -- but the +provided fixtures here remain ASCII for portability. diff --git a/.github/skills/pr-description-skill/evals/content/auth-refactor.json b/.github/skills/pr-description-skill/evals/content/auth-refactor.json new file mode 100644 index 000000000..3e4ae270d --- /dev/null +++ b/.github/skills/pr-description-skill/evals/content/auth-refactor.json @@ -0,0 +1,43 @@ +{ + "schema_version": 1, + "id": "auth-refactor", + "summary": "Cross-cutting auth refactor: token resolution moves into a single AuthResolver; multiple call sites updated; one breaking change to a public CLI flag. 
PR is non-trivial and exercises every required section.", + "scenario": { + "branch": "refactor/auth-resolver", + "base": "main", + "files_changed": [ + "M src/apm_cli/auth/__init__.py", + "A src/apm_cli/auth/resolver.py", + "M src/apm_cli/cli.py", + "M src/apm_cli/integration/git.py", + "M tests/unit/auth/test_resolver.py", + "M CHANGELOG.md" + ], + "commits": [ + "feat(auth): introduce AuthResolver as single source of token resolution", + "refactor(cli): replace per-host PAT lookup with AuthResolver.get(host)", + "fix(integration/git): drop legacy GITHUB_APM_PAT fallback path", + "docs(changelog): note breaking removal of --token-source flag" + ], + "linked_issue": "#812 -- token resolution scattered across modules; intermittent EMU lookups failing", + "validation_evidence": "uv run pytest tests/unit/auth -x -> 142 passed in 3.4s\napm audit --ci -> 0 findings" + }, + "rubric": [ + {"id": "tldr-present", "pattern": "(?im)^\\s*##?\\s*TL;DR\\b", "weight": 1, "description": "TL;DR section header present"}, + {"id": "tldr-short", "pattern": "(?is)##?\\s*TL;DR.{1,800}?\\n##?\\s", "weight": 1, "description": "TL;DR is short (under 800 chars before next H2)"}, + {"id": "problem-section", "pattern": "(?im)^\\s*##?\\s*Problem\\b", "weight": 1, "description": "Problem (WHY) section present"}, + {"id": "anchored-quote", "pattern": "\\[\"[^\"]{8,}\"\\]\\(https?://", "weight": 2, "description": "At least one verbatim quote anchored to a URL"}, + {"id": "approach-or-implementation", "pattern": "(?im)^\\s*##?\\s*(Approach|Implementation)\\b", "weight": 1, "description": "Approach or Implementation section present"}, + {"id": "mermaid-block", "pattern": "(?s)```mermaid[\\s\\S]+?```", "weight": 2, "description": "At least one mermaid diagram block"}, + {"id": "tradeoffs-section", "pattern": "(?im)^\\s*##?\\s*Trade-?offs?\\b", "weight": 1, "description": "Trade-offs section present"}, + {"id": "validation-section", "pattern": "(?im)^\\s*##?\\s*Validation\\b", "weight": 1, 
"description": "Validation section present"}, + {"id": "validation-real-output", "pattern": "(?im)pytest|apm audit|uv run", "weight": 1, "description": "Validation shows real command output"}, + {"id": "how-to-test", "pattern": "(?im)^\\s*##?\\s*How to test\\b", "weight": 1, "description": "How to test section present"}, + {"id": "trailer", "pattern": "Co-authored-by: Copilot <223556219", "weight": 1, "description": "Copilot co-author trailer present"}, + {"id": "no-marketing-tone", "pattern": "(?i)significantly enhances|best-in-class|game-changing|revolutionary", "weight": -2, "description": "PENALTY: marketing tone detected"} + ], + "fixtures": { + "with_skill": "../fixtures/auth-refactor__with_skill.md", + "without_skill": "../fixtures/auth-refactor__without_skill.md" + } +} diff --git a/.github/skills/pr-description-skill/evals/content/dep-bump.json b/.github/skills/pr-description-skill/evals/content/dep-bump.json new file mode 100644 index 000000000..1deee05a1 --- /dev/null +++ b/.github/skills/pr-description-skill/evals/content/dep-bump.json @@ -0,0 +1,37 @@ +{ + "schema_version": 1, + "id": "dep-bump", + "summary": "Mechanical dependency upgrade: bump click from 8.1.7 to 8.2.1; lockfile-only changes plus a 1-line type annotation tweak. 
Tests Trade-offs may be 1-2 bullets and mermaid is optional.", + "scenario": { + "branch": "deps/click-8.2.1", + "base": "main", + "files_changed": [ + "M pyproject.toml", + "M uv.lock", + "M src/apm_cli/cli.py" + ], + "commits": [ + "chore(deps): bump click 8.1.7 -> 8.2.1", + "chore(cli): adjust type hint for new click.Context generic" + ], + "linked_issue": "(none)", + "validation_evidence": "uv sync --extra dev -> resolved in 2.1s\nuv run pytest tests/unit -x -> 2418 passed in 38.7s" + }, + "rubric": [ + {"id": "tldr-present", "pattern": "(?im)^\\s*##?\\s*TL;DR\\b", "weight": 1, "description": "TL;DR present"}, + {"id": "tldr-short", "pattern": "(?is)##?\\s*TL;DR.{1,500}?\\n##?\\s", "weight": 1, "description": "TL;DR very short on a mechanical PR"}, + {"id": "problem-section", "pattern": "(?im)^\\s*##?\\s*Problem\\b", "weight": 1, "description": "Problem section present"}, + {"id": "implementation-section", "pattern": "(?im)^\\s*##?\\s*Implementation\\b", "weight": 1, "description": "Implementation section present"}, + {"id": "tradeoffs-section", "pattern": "(?im)^\\s*##?\\s*Trade-?offs?\\b", "weight": 1, "description": "Trade-offs section present (may be 1-2 bullets)"}, + {"id": "validation-section", "pattern": "(?im)^\\s*##?\\s*Validation\\b", "weight": 1, "description": "Validation section present"}, + {"id": "validation-real-output", "pattern": "(?im)pytest|uv sync|2418 passed", "weight": 1, "description": "Real validation output (pytest counts) shown"}, + {"id": "how-to-test", "pattern": "(?im)^\\s*##?\\s*How to test\\b", "weight": 1, "description": "How to test section present"}, + {"id": "trailer", "pattern": "Co-authored-by: Copilot <223556219", "weight": 1, "description": "Copilot co-author trailer present"}, + {"id": "no-marketing-tone", "pattern": "(?i)significantly enhances|best-in-class|game-changing", "weight": -2, "description": "PENALTY: marketing tone detected"}, + {"id": "no-diff-restate", "pattern": "(?im)^[+-]{3} ", "weight": -1, 
"description": "PENALTY: diff lines restated in body"} + ], + "fixtures": { + "with_skill": "../fixtures/dep-bump__with_skill.md", + "without_skill": "../fixtures/dep-bump__without_skill.md" + } +} diff --git a/.github/skills/pr-description-skill/evals/content/docs-only.json b/.github/skills/pr-description-skill/evals/content/docs-only.json new file mode 100644 index 000000000..c3bc93c08 --- /dev/null +++ b/.github/skills/pr-description-skill/evals/content/docs-only.json @@ -0,0 +1,34 @@ +{ + "schema_version": 1, + "id": "docs-only", + "summary": "Markdown-only PR adding a quickstart guide. The skill must keep TL;DR, Problem, Validation, How-to-test sections even though the PR is 'trivial' (per SKILL.md gotcha).", + "scenario": { + "branch": "docs/quickstart", + "base": "main", + "files_changed": [ + "A docs/src/content/docs/guides/quickstart.md", + "M docs/src/content/docs/index.md" + ], + "commits": [ + "docs(quickstart): add 5-minute getting-started guide", + "docs(index): link quickstart from landing page" + ], + "linked_issue": "#640 -- new contributors bounce after install; need a 5-minute happy path", + "validation_evidence": "npm run --prefix docs build -> built in 4.2s, no broken links\napm audit --ci -> 0 findings" + }, + "rubric": [ + {"id": "tldr-present", "pattern": "(?im)^\\s*##?\\s*TL;DR\\b", "weight": 1, "description": "TL;DR present even on docs-only PR"}, + {"id": "problem-section", "pattern": "(?im)^\\s*##?\\s*Problem\\b", "weight": 1, "description": "Problem section present even on docs-only PR (per SKILL.md gotcha)"}, + {"id": "anchored-quote", "pattern": "\\[\"[^\"]{8,}\"\\]\\(https?://", "weight": 2, "description": "At least one verbatim anchored quote"}, + {"id": "validation-section", "pattern": "(?im)^\\s*##?\\s*Validation\\b", "weight": 1, "description": "Validation section present"}, + {"id": "validation-build", "pattern": "(?im)npm run|build|broken link", "weight": 1, "description": "Real validation evidence (build/link check) shown"}, + 
{"id": "how-to-test", "pattern": "(?im)^\\s*##?\\s*How to test\\b", "weight": 1, "description": "How to test present"}, + {"id": "trailer", "pattern": "Co-authored-by: Copilot <223556219", "weight": 1, "description": "Copilot co-author trailer present"}, + {"id": "no-trivial-skip", "pattern": "(?i)the PR is trivial|too small to need", "weight": -2, "description": "PENALTY: skipping sections because PR is small"}, + {"id": "no-marketing-tone", "pattern": "(?i)significantly enhances|best-in-class|game-changing", "weight": -2, "description": "PENALTY: marketing tone detected"} + ], + "fixtures": { + "with_skill": "../fixtures/docs-only__with_skill.md", + "without_skill": "../fixtures/docs-only__without_skill.md" + } +} diff --git a/.github/skills/pr-description-skill/evals/evals.json b/.github/skills/pr-description-skill/evals/evals.json new file mode 100644 index 000000000..f7a8750ea --- /dev/null +++ b/.github/skills/pr-description-skill/evals/evals.json @@ -0,0 +1,62 @@ +{ + "schema_version": 1, + "skill": "pr-description-skill", + "skill_path": "../SKILL.md", + "triggers_manifest": "triggers.json", + "content_manifests": [ + "content/auth-refactor.json", + "content/docs-only.json", + "content/dep-bump.json" + ], + "gates": { + "triggers": { + "split": "val", + "should_fire_rate_min": 0.5, + "should_not_fire_rate_max": 0.5 + }, + "content": { + "delta_min_anchors": 1, + "scope": "every_scenario" + } + }, + "stop_list": [ + "commit message", + "release note", + "release notes", + "changelog", + "open an issue", + "open issue", + "review this pr", + "code review comment", + "design doc", + "summarize the diff" + ], + "trigger_keywords_primary": [ + "pr description", + "pr body", + "pull request description", + "pull request body", + "pr template", + "pr write-up", + "pr writeup", + "draft the pr", + "draft a pr", + "open the pr", + "open a pr", + "summarize this branch as a pr", + "fill in the pr template" + ], + "trigger_keywords_secondary": [ + "pr", + "pull", + 
"request", + "description", + "body", + "draft", + "open", + "write", + "fill", + "summarize", + "branch" + ] +} diff --git a/.github/skills/pr-description-skill/evals/fixtures/auth-refactor__with_skill.md b/.github/skills/pr-description-skill/evals/fixtures/auth-refactor__with_skill.md new file mode 100644 index 000000000..0078a04a9 --- /dev/null +++ b/.github/skills/pr-description-skill/evals/fixtures/auth-refactor__with_skill.md @@ -0,0 +1,105 @@ +refactor(auth): centralize token resolution behind AuthResolver + +## TL;DR + +Token resolution was scattered across `cli.py`, `integration/git.py`, +and ad-hoc `os.environ` lookups, producing intermittent EMU failures +and a `--token-source` flag whose semantics no two callers agreed +on. This PR introduces a single `AuthResolver` and removes the +flag. One breaking change (flag removal) is documented in the +CHANGELOG. + +## Problem (WHY) + +- Three modules each implemented their own host-to-token mapping, + diverging on EMU host suffixes (`#812`). +- The `--token-source` CLI flag let callers override resolution + per-invocation but was respected only by `cli.py`, silently + ignored by the git integration layer. +- This produced support reports of "auth works in `apm install` + but fails in `apm compile`" for the same machine. + +The skill's PROSE foundation calls this out directly: +["Grounding outputs in deterministic tool execution transforms probabilistic generation into verifiable action."](https://danielmeppiel.github.io/awesome-ai-native/docs/prose/) +-- token resolution that branches by call site is the opposite of +deterministic. + +## Approach (WHAT) + +A single `AuthResolver` owns the host-to-token mapping. Callers +ask `AuthResolver.get(host)` and get a token or a typed error. +The `--token-source` flag is removed; the resolver consults a +fixed precedence (env -> keyring -> gh auth) that is identical +for every caller. 
+ +## Implementation (HOW) + +| File | Change | +|---|---| +| `src/apm_cli/auth/resolver.py` | NEW. `AuthResolver` class with `get(host)`, fixed precedence chain. | +| `src/apm_cli/auth/__init__.py` | Re-export `AuthResolver`. | +| `src/apm_cli/cli.py` | Drop `--token-source`; route all auth lookups through `AuthResolver`. | +| `src/apm_cli/integration/git.py` | Drop legacy `GITHUB_APM_PAT` fallback; delegate to resolver. | +| `tests/unit/auth/test_resolver.py` | New unit tests for resolver precedence and EMU host parsing. | +| `CHANGELOG.md` | Note `--token-source` removal under Breaking Changes. | + +## Diagram + +The resolver becomes the only edge from callers to the token +sources; the legacy fan-out is collapsed. + +```mermaid +flowchart LR + CLI["apm cli command"] --> R["AuthResolver"] + GIT["git integration"] --> R + R --> ENV["environment variables"] + R --> KR["OS keyring"] + R --> GH["gh auth status"] +``` + +## Trade-offs + +- **Removed `--token-source`**: minor break for power users who + scripted around the flag; mitigated by the env-var path which + covers every documented use case. +- **Single precedence chain**: any future per-host policy (e.g. + ADO_APM_PAT for Azure DevOps) plugs in as a new resolver + source, not a new caller-side branch. +- **Rejected**: keeping the flag and routing it through the + resolver. That preserves the bug surface (callers still + passing different flag values) without adding value. + +## Benefits + +1. One source of truth for auth (closes #812). +2. EMU host suffixes parsed in one place. +3. Test surface shrinks from 3 modules to 1. +4. Removes 84 lines of duplicated env-var lookup code. + +## Validation + +
Full test output + +``` +$ uv run pytest tests/unit/auth -x +============================= test session starts ============================= +collected 142 items +tests/unit/auth/test_resolver.py ............................................ +142 passed in 3.4s + +$ apm audit --ci +0 findings +``` + +
+ +## How to test + +- [ ] `uv sync --extra dev` +- [ ] `uv run pytest tests/unit/auth -x` (expect 142 passing) +- [ ] `apm install microsoft/apm/packages/apm-guide` against an + EMU host; confirm the install completes without prompting. +- [ ] Try `apm install --token-source=env`; expect a clear + "unknown option" error (flag is gone). + +Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> diff --git a/.github/skills/pr-description-skill/evals/fixtures/auth-refactor__without_skill.md b/.github/skills/pr-description-skill/evals/fixtures/auth-refactor__without_skill.md new file mode 100644 index 000000000..10b6d2119 --- /dev/null +++ b/.github/skills/pr-description-skill/evals/fixtures/auth-refactor__without_skill.md @@ -0,0 +1,32 @@ +# Auth refactor + +This PR refactors authentication. It centralizes token resolution +into a new AuthResolver class and removes the --token-source flag. + +## What changed + +- Added a new file src/apm_cli/auth/resolver.py with the + AuthResolver class. +- Updated src/apm_cli/auth/__init__.py to export AuthResolver. +- Updated src/apm_cli/cli.py to use AuthResolver for all token + lookups and removed the --token-source flag. +- Updated src/apm_cli/integration/git.py to delegate to + AuthResolver and dropped the GITHUB_APM_PAT fallback. +- Added unit tests in tests/unit/auth/test_resolver.py. +- Updated CHANGELOG.md. + +## Why + +The previous code had token resolution logic scattered across +multiple modules. This was fragile and led to bugs. Centralizing +it makes the code cleaner and easier to maintain. This is a +significantly enhanced approach to authentication. + +## Testing + +I ran the tests and they passed. The audit also passed. + +## Notes + +This is a breaking change because the --token-source flag is +removed. 
diff --git a/.github/skills/pr-description-skill/evals/fixtures/dep-bump__with_skill.md b/.github/skills/pr-description-skill/evals/fixtures/dep-bump__with_skill.md new file mode 100644 index 000000000..06d48f629 --- /dev/null +++ b/.github/skills/pr-description-skill/evals/fixtures/dep-bump__with_skill.md @@ -0,0 +1,52 @@ +chore(deps): bump click 8.1.7 -> 8.2.1 + +## TL;DR + +Routine click upgrade. Picks up the new `Context` generic typing +and a fix for boolean-flag short forms. One type hint adjusted in +`cli.py`. Lockfile regenerated; full unit suite green. + +## Problem (WHY) + +- click 8.2.x ships a typing fix flagged by mypy on Python 3.12. +- Staying on 8.1.7 means our type checks fail under the supported + Python matrix. + +The skill's anchor for cite-or-omit applies here too: +["Add what the agent lacks, omit what it knows"](https://agentskills.io/skill-creation/best-practices) +-- this PR body says only what the diff alone cannot. + +## Approach (WHAT) + +Mechanical. Bump version pin, regenerate lockfile, adjust one +type hint. + +## Implementation (HOW) + +| File | Change | +|---|---| +| `pyproject.toml` | `click = "^8.2.1"` (was `^8.1.7`). | +| `uv.lock` | Regenerated by `uv sync`. | +| `src/apm_cli/cli.py` | One type hint updated for new `click.Context` generic. | + +## Trade-offs + +- **Direct bump vs caret-locking to 8.1**: chose direct bump + because the typing fix is the reason for the upgrade. 
+ +## Validation + +``` +$ uv sync --extra dev +resolved in 2.1s +$ uv run pytest tests/unit -x +2418 passed in 38.7s +``` + +## How to test + +- [ ] `uv sync --extra dev` +- [ ] `uv run pytest tests/unit -x` (expect 2418 passing) +- [ ] `uv run mypy src/apm_cli` (expect 0 errors) + +Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> diff --git a/.github/skills/pr-description-skill/evals/fixtures/dep-bump__without_skill.md b/.github/skills/pr-description-skill/evals/fixtures/dep-bump__without_skill.md new file mode 100644 index 000000000..21ccd2594 --- /dev/null +++ b/.github/skills/pr-description-skill/evals/fixtures/dep-bump__without_skill.md @@ -0,0 +1,13 @@ +# Bump click + +This bumps click from 8.1.7 to 8.2.1. + +I also adjusted a type hint in cli.py because click.Context is +now generic. + +The lockfile was regenerated. + +This is a game-changing dependency upgrade for our type-checking +story. + +Tests pass. diff --git a/.github/skills/pr-description-skill/evals/fixtures/docs-only__with_skill.md b/.github/skills/pr-description-skill/evals/fixtures/docs-only__with_skill.md new file mode 100644 index 000000000..382f5470b --- /dev/null +++ b/.github/skills/pr-description-skill/evals/fixtures/docs-only__with_skill.md @@ -0,0 +1,63 @@ +docs(quickstart): add 5-minute getting-started guide + +## TL;DR + +New contributors land on the docs index, scroll, and bounce. +This PR adds a 5-minute quickstart linked from the landing page +so first-touch users have a happy path before reading reference +material. + +## Problem (WHY) + +- Issue #640 reports a 60% bounce rate on `/docs/` for new + visitors. +- The current landing page lists every guide in alphabetical + order; "Quickstart" did not exist. +- Agent Skills makes the diagnosis explicit: + ["agents pattern-match well against concrete structures"](https://agentskills.io/skill-creation/best-practices) + -- and the same is true for human readers landing cold. 
+ +## Approach (WHAT) + +Additive: see Implementation. One new page, one link added on +the index. + +## Implementation (HOW) + +| File | Change | +|---|---| +| `docs/src/content/docs/guides/quickstart.md` | NEW. 5-minute happy path: install, init, run. | +| `docs/src/content/docs/index.md` | Promote quickstart link above the alphabetical guide list. | + +## Trade-offs + +- **Promoting one guide above the list**: minor visual + asymmetry; pays for itself if bounce rate drops. +- **Rejected**: rewriting the landing page. Higher risk, larger + diff, harder to A/B. + +## Benefits + +1. Closes #640 if bounce rate drops measurably. +2. New contributor first-touch falls under 5 minutes. +3. Zero code change; rollback is `git revert`. + +## Validation + +``` +$ npm run --prefix docs build +built in 4.2s +no broken links +$ apm audit --ci +0 findings +``` + +## How to test + +- [ ] `npm run --prefix docs dev` +- [ ] Open `http://localhost:4321/docs/`; confirm Quickstart + link is above the fold. +- [ ] Click through; confirm the 5 steps complete on a fresh + clone in under 5 minutes. + +Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> diff --git a/.github/skills/pr-description-skill/evals/fixtures/docs-only__without_skill.md b/.github/skills/pr-description-skill/evals/fixtures/docs-only__without_skill.md new file mode 100644 index 000000000..5dddbc2fc --- /dev/null +++ b/.github/skills/pr-description-skill/evals/fixtures/docs-only__without_skill.md @@ -0,0 +1,14 @@ +# Add quickstart guide + +Adds a quickstart guide to the docs. + +## Changes + +- Added docs/src/content/docs/guides/quickstart.md +- Updated docs/src/content/docs/index.md to link to it + +The PR is trivial -- just docs. No code changes. + +## Build + +The docs build passes. 
diff --git a/.github/skills/pr-description-skill/evals/results/.gitkeep b/.github/skills/pr-description-skill/evals/results/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.github/skills/pr-description-skill/evals/triggers.json b/.github/skills/pr-description-skill/evals/triggers.json new file mode 100644 index 000000000..c5baa891c --- /dev/null +++ b/.github/skills/pr-description-skill/evals/triggers.json @@ -0,0 +1,131 @@ +{ + "schema_version": 1, + "items": [ + { + "id": "fire-01", + "query": "write a PR description for me", + "expected": "fire", + "split": "train", + "notes": "Verbatim phrase from SKILL.md description." + }, + { + "id": "fire-02", + "query": "draft the PR body", + "expected": "fire", + "split": "train", + "notes": "Verbatim phrase from SKILL.md description." + }, + { + "id": "fire-03", + "query": "open this PR -- can you fill in the template?", + "expected": "fire", + "split": "train", + "notes": "Combined open-PR + fill-template trigger." + }, + { + "id": "fire-04", + "query": "create the PR write-up please", + "expected": "fire", + "split": "train", + "notes": "Verbatim phrase from SKILL.md trigger list." + }, + { + "id": "fire-05", + "query": "summarize this branch as a PR", + "expected": "fire", + "split": "train", + "notes": "Verbatim phrase from SKILL.md trigger list." + }, + { + "id": "fire-06", + "query": "let's open the PR -- write the body", + "expected": "fire", + "split": "val", + "notes": "Open-PR + body, near-domain paraphrase." + }, + { + "id": "fire-07", + "query": "draft a pull request description for this branch", + "expected": "fire", + "split": "val", + "notes": "Long form of 'PR description'." + }, + { + "id": "fire-08", + "query": "pls write the pull request body", + "expected": "fire", + "split": "val", + "notes": "Casual tone, long form." 
+ }, + { + "id": "fire-09", + "query": "fill in the PR template before we ship", + "expected": "fire", + "split": "val", + "notes": "Verbatim phrase wrapped in a longer sentence." + }, + { + "id": "no-fire-01", + "query": "write a commit message for these changes", + "expected": "no_fire", + "split": "train", + "notes": "Adjacent intent (commit, not PR) -- common confusion." + }, + { + "id": "no-fire-02", + "query": "summarize this branch as a release note", + "expected": "no_fire", + "split": "train", + "notes": "Adjacent intent (release notes, not PR body)." + }, + { + "id": "no-fire-03", + "query": "draft a CHANGELOG entry for this work", + "expected": "no_fire", + "split": "train", + "notes": "Changelog work, not PR description." + }, + { + "id": "no-fire-04", + "query": "open an issue describing this bug", + "expected": "no_fire", + "split": "train", + "notes": "Issue, not PR." + }, + { + "id": "no-fire-05", + "query": "review this PR for me", + "expected": "no_fire", + "split": "train", + "notes": "Reviewing != describing." + }, + { + "id": "no-fire-06", + "query": "summarize the diff for me", + "expected": "no_fire", + "split": "val", + "notes": "Diff summary is an INPUT to PR description skill, not the same task." + }, + { + "id": "no-fire-07", + "query": "draft a code review comment", + "expected": "no_fire", + "split": "val", + "notes": "Review comment, not PR body." + }, + { + "id": "no-fire-08", + "query": "write the design doc for this branch", + "expected": "no_fire", + "split": "val", + "notes": "Design doc lives upstream of PR; different artifact." + }, + { + "id": "no-fire-09", + "query": "give me release notes for this milestone", + "expected": "no_fire", + "split": "val", + "notes": "Release notes, not PR body." 
+ } + ] +} diff --git a/.github/skills/pr-description-skill/scripts/run_evals.py b/.github/skills/pr-description-skill/scripts/run_evals.py new file mode 100644 index 000000000..c5056aaf1 --- /dev/null +++ b/.github/skills/pr-description-skill/scripts/run_evals.py @@ -0,0 +1,343 @@ +#!/usr/bin/env python3 +# ASCII-only. Runs the pr-description-skill evals suite. +"""Run pr-description-skill evals. + +Two eval families are exercised: + + * TRIGGER EVALS scoring the SKILL.md `description:` against + should-fire and should-not-fire queries via a deterministic + keyword/bigram matcher. The matcher is documented in the + README; it approximates dispatcher behavior without requiring + a live LLM. + + * CONTENT EVALS scoring pre-recorded `with_skill` and + `without_skill` fixtures against per-scenario regex rubrics. + The delta between the two scores is reported per scenario. + +The runner is non-interactive, stdlib-only, and emits structured +JSON on stdout (machine-readable summary) plus diagnostics on +stderr. Exit codes: + + 0 = all gates met + 1 = one or more gates failed + 2 = runner error (manifest or fixture missing, parse error) + +Run from the worktree root: + + python .apm/skills/pr-description-skill/scripts/run_evals.py + +Use --help for full options. 
+""" +from __future__ import annotations + +import argparse +import datetime as _dt +import json +import re +import sys +from pathlib import Path +from typing import Any + +SCHEMA_VERSION = 1 +SKILL_DIR = Path(__file__).resolve().parent.parent +EVALS_DIR = SKILL_DIR / "evals" + + +def _log(msg: str) -> None: + print(msg, file=sys.stderr) + + +def _load_json(path: Path) -> dict[str, Any]: + try: + return json.loads(path.read_text(encoding="utf-8")) + except FileNotFoundError: + _log(f"[x] missing file: {path}") + sys.exit(2) + except json.JSONDecodeError as exc: + _log(f"[x] invalid JSON in {path}: {exc}") + sys.exit(2) + + +# --------------------------------------------------------------------------- +# trigger evals +# --------------------------------------------------------------------------- + +def _normalize(text: str) -> str: + return re.sub(r"\s+", " ", text.lower()).strip() + + +def score_trigger(query: str, manifest: dict[str, Any]) -> tuple[bool, dict[str, Any]]: + """Return (predicted_fire, diagnostic) for one query. + + Rule (deterministic dispatcher approximation): + 1. Lowercase + collapse whitespace. + 2. If any phrase from `stop_list` appears verbatim, predict + no_fire (negative override beats everything else). + 3. If any phrase from `trigger_keywords_primary` appears + verbatim, predict fire. + 4. Otherwise count distinct `trigger_keywords_secondary` + tokens present; predict fire iff the count is >= 3 AND + at least one of {pr, pull} is present. 
+ """ + q = _normalize(query) + + for stop in manifest["stop_list"]: + if stop in q: + return False, {"reason": "stop_list_hit", "match": stop} + + for kw in manifest["trigger_keywords_primary"]: + if kw in q: + return True, {"reason": "primary_match", "match": kw} + + sec = manifest["trigger_keywords_secondary"] + hits = sorted({k for k in sec if re.search(rf"\b{re.escape(k)}\b", q)}) + has_pr = any(t in hits for t in ("pr", "pull")) + if has_pr and len(hits) >= 3: + return True, {"reason": "secondary_threshold", "hits": hits} + + return False, {"reason": "no_match", "hits": hits} + + +def run_trigger_evals( + manifest: dict[str, Any], split: str +) -> dict[str, Any]: + triggers_path = EVALS_DIR / manifest["triggers_manifest"] + triggers = _load_json(triggers_path)["items"] + + if split != "all": + triggers = [t for t in triggers if t["split"] == split] + + rows: list[dict[str, Any]] = [] + fire_total = fire_correct = 0 + no_fire_total = no_fire_correct = 0 + + for item in triggers: + predicted_fire, diag = score_trigger(item["query"], manifest) + expected_fire = item["expected"] == "fire" + passed = predicted_fire == expected_fire + + rows.append({ + "id": item["id"], + "split": item["split"], + "query": item["query"], + "expected": item["expected"], + "predicted": "fire" if predicted_fire else "no_fire", + "passed": passed, + "diagnostic": diag, + }) + + if expected_fire: + fire_total += 1 + if passed: + fire_correct += 1 + else: + no_fire_total += 1 + if passed: + no_fire_correct += 1 + + fire_rate = (fire_correct / fire_total) if fire_total else 0.0 + no_fire_rate = (no_fire_correct / no_fire_total) if no_fire_total else 0.0 + + gates = manifest["gates"]["triggers"] + fire_gate_met = fire_rate >= gates["should_fire_rate_min"] + no_fire_gate_met = ( + (no_fire_rate) >= (1.0 - gates["should_not_fire_rate_max"]) + ) + + return { + "split": split, + "should_fire_correct_rate": fire_rate, + "should_not_fire_correct_rate": no_fire_rate, + "should_fire_gate": { + 
"min": gates["should_fire_rate_min"], + "met": fire_gate_met, + }, + "should_not_fire_gate": { + "max_miss_rate": gates["should_not_fire_rate_max"], + "met": no_fire_gate_met, + }, + "rows": rows, + "passed": fire_gate_met and no_fire_gate_met, + } + + +# --------------------------------------------------------------------------- +# content evals +# --------------------------------------------------------------------------- + +def score_fixture(text: str, rubric: list[dict[str, Any]]) -> dict[str, Any]: + score = 0 + anchors_hit: list[str] = [] + anchors_missed: list[str] = [] + for check in rubric: + pattern = re.compile(check["pattern"]) + match = bool(pattern.search(text)) + weight = int(check["weight"]) + if match: + score += weight + anchors_hit.append(check["id"]) + else: + if weight < 0: + # penalty pattern: not matching is GOOD, no score change + pass + else: + anchors_missed.append(check["id"]) + return {"score": score, "anchors_hit": anchors_hit, "anchors_missed": anchors_missed} + + +def run_content_eval(scenario_path: Path, manifest_root: Path) -> dict[str, Any]: + scenario = _load_json(scenario_path) + + fixtures = scenario["fixtures"] + with_path = (scenario_path.parent / fixtures["with_skill"]).resolve() + without_path = (scenario_path.parent / fixtures["without_skill"]).resolve() + + if not with_path.exists(): + _log(f"[x] missing fixture: {with_path}") + sys.exit(2) + if not without_path.exists(): + _log(f"[x] missing fixture: {without_path}") + sys.exit(2) + + with_text = with_path.read_text(encoding="utf-8") + without_text = without_path.read_text(encoding="utf-8") + + with_score = score_fixture(with_text, scenario["rubric"]) + without_score = score_fixture(without_text, scenario["rubric"]) + + delta_score = with_score["score"] - without_score["score"] + delta_anchors = len( + set(with_score["anchors_hit"]) - set(without_score["anchors_hit"]) + ) + + return { + "id": scenario["id"], + "summary": scenario["summary"], + "with_skill": with_score, + 
"without_skill": without_score, + "delta_score": delta_score, + "delta_anchors": delta_anchors, + } + + +def run_content_evals(manifest: dict[str, Any]) -> dict[str, Any]: + results: list[dict[str, Any]] = [] + delta_min = int(manifest["gates"]["content"]["delta_min_anchors"]) + all_passed = True + + for rel in manifest["content_manifests"]: + scenario_path = EVALS_DIR / rel + result = run_content_eval(scenario_path, EVALS_DIR) + result["passed"] = result["delta_anchors"] >= delta_min + if not result["passed"]: + all_passed = False + results.append(result) + + return { + "delta_min_anchors": delta_min, + "scenarios": results, + "passed": all_passed, + } + + +# --------------------------------------------------------------------------- +# orchestration +# --------------------------------------------------------------------------- + +def build_parser() -> argparse.ArgumentParser: + p = argparse.ArgumentParser( + prog="run_evals.py", + description="Run pr-description-skill evals (triggers + content).", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + p.add_argument( + "--filter", + choices=("all", "triggers", "content"), + default="all", + help="Eval family to run (default: all).", + ) + p.add_argument( + "--split", + choices=("all", "train", "val"), + default="val", + help="Trigger-eval split to score for the gate (default: val, the ship gate).", + ) + p.add_argument( + "--manifest", + default=str(EVALS_DIR / "evals.json"), + help="Path to evals manifest (default: /evals/evals.json).", + ) + p.add_argument( + "--results-dir", + default=str(EVALS_DIR / "results"), + help="Directory to write timestamped result JSON.", + ) + p.add_argument( + "--quiet", + action="store_true", + help="Suppress stderr diagnostics.", + ) + p.add_argument( + "--no-write", + action="store_true", + help="Do not write a result file to --results-dir.", + ) + return p + + +def main(argv: list[str] | None = None) -> int: + args = build_parser().parse_args(argv) + + if args.quiet: + 
global _log + _log = lambda msg: None # noqa: E731 + + manifest_path = Path(args.manifest) + manifest = _load_json(manifest_path) + if manifest.get("schema_version") != SCHEMA_VERSION: + _log( + f"[!] schema_version mismatch: manifest={manifest.get('schema_version')} " + f"runner={SCHEMA_VERSION}" + ) + + summary: dict[str, Any] = { + "schema_version": SCHEMA_VERSION, + "skill": manifest.get("skill"), + "timestamp_utc": _dt.datetime.now(tz=_dt.timezone.utc).isoformat(), + "filter": args.filter, + "split": args.split, + } + + overall_passed = True + + if args.filter in ("all", "triggers"): + _log("[>] running trigger evals") + trig = run_trigger_evals(manifest, args.split) + summary["triggers"] = trig + if not trig["passed"]: + overall_passed = False + + if args.filter in ("all", "content"): + _log("[>] running content evals") + cont = run_content_evals(manifest) + summary["content"] = cont + if not cont["passed"]: + overall_passed = False + + summary["passed"] = overall_passed + + if not args.no_write: + results_dir = Path(args.results_dir) + results_dir.mkdir(parents=True, exist_ok=True) + ts = summary["timestamp_utc"].replace(":", "-").replace("+00-00", "Z") + out_path = results_dir / f"{ts}.json" + out_path.write_text(json.dumps(summary, indent=2) + "\n", encoding="utf-8") + summary["result_file"] = str(out_path.relative_to(SKILL_DIR.parent.parent.parent)) + _log(f"[+] wrote {out_path}") + + print(json.dumps(summary, indent=2)) + return 0 if overall_passed else 1 + + +if __name__ == "__main__": + sys.exit(main())