From d76c1aaa11dde15e1025592b3e9dfec933c580d2 Mon Sep 17 00:00:00 2001
From: Ralf Anton Beier
Date: Sun, 26 Apr 2026 08:42:49 +0200
Subject: [PATCH] =?UTF-8?q?feat:=20hardened=20AI=20review=20=E2=80=94=20st?=
 =?UTF-8?q?rict=20JSON,=20slop=20filter,=20quote-or-die,=20no-escape=20ver?=
 =?UTF-8?q?dict?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Why

The deployed AI review on PR #23 was textbook small-model slop: duplicated
content, generic platitudes, hallucinated risks (claimed "tokens" in a
config-flip PR), zero file:line anchors. Root cause is the freeform 5-section
prompt (`src/ai-review.js:336`) — it gave the 3B model permission to fill each
section with whatever sounded reviewer-ish.

Per the user's request: every escape hatch the model uses to avoid producing
concrete output must be made mechanically invalid.

## What

New module `src/ai-review-prompt.js` containing four locked-down primitives:

| Primitive | Job |
|---|---|
| `STRICT_SYSTEM_PROMPT` | Forces JSON output, banned hedging words, banned slop phrases, NEVER refuse. Includes a one-shot example to anchor format. |
| `tryParseReview()` | Strict shape enforcement: enum verdict, array findings, all fields typed. On parse failure, return `{ok: false}` — no retry. |
| `filterFindings()` | Quote-or-die: drop any finding whose `quoted_line` isn't verbatim in the diff. Slop filter: drop any claim with hedging words or known filler phrases. |
| `computeVerdict()` | Deterministic from filtered findings (`approve` if empty, `comment` if any). Model's verdict is advisory only — the postprocessor decides. |

`renderReviewMarkdown()` returns `null` when verdict is `approve` and findings
are empty — the bot then does NOT post. Silence > slop. A condensed sketch of
how these primitives compose follows the rollout notes below.

## Behaviour change

- Default `ai_review.system_prompt` flipped from the freeform 5-section string
  to `STRICT_SYSTEM_PROMPT`. Users with `system_prompt:` set explicitly in
  `config.local.yml` continue using the legacy freeform path (back-compat for
  existing deployments).
- New review records carry `assessment` and a `findings` count → the
  dashboard's `getReviewStats()` finally has real data instead of `unknown`.

## Test plan

- [x] All 737 tests pass (was 698 — added 39 covering the parser, slop filter,
      verdict computation, and render-skip behavior)
- [x] eslint clean
- [ ] After merge + self-update: AI review on the next PR should produce one
      of: (a) a tight comment with verdict + summary + file:line-anchored
      findings, or (b) no comment at all (when there is nothing concrete to
      flag).
- [ ] Verify on netcup logs that the JSON parse rate is high — if Ollama keeps
      producing prose around the JSON, we may need to tighten the
      `temperature` config (currently 0.3) or add an assistant-prefix
      pre-fill.

## Risk & rollout

- Risk: low. The worst case (the model produces unparseable output every time)
  is silent — no comments posted. Better than the current state, where every
  comment is slop.
- Rollout: self-update on merge. Confirmation = the next non-bot PR opened in
  any pulseengine repo gets either a strict-format review or no review.
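For reference, a minimal sketch of how the four primitives above compose,
mirroring the strict-mode branch this patch adds to `reviewPullRequest()` in
`src/ai-review.js`. The wrapper name `decideAndRender`, the import path, and
all inputs are stand-ins for illustration, not part of the patch:

```js
// Illustrative only: decideAndRender and the inputs below are hypothetical;
// the real wiring is inlined in reviewPullRequest() (src/ai-review.js).
import {
  tryParseReview,
  filterFindings,
  computeVerdict,
  renderReviewMarkdown
} from './src/ai-review-prompt.js';

function decideAndRender(rawModelText, diffText, prNumber, headSha, meta) {
  const parsed = tryParseReview(rawModelText);
  if (!parsed.ok) {
    // Unparseable model output is dropped outright: no retry, no comment.
    return { skipped: true, reason: parsed.error };
  }
  const findings = filterFindings(parsed.data.findings, diffText); // quote-or-die + slop filter
  const assessment = computeVerdict(findings);                     // model's own verdict is ignored
  const body = renderReviewMarkdown({ ...parsed.data, findings }, prNumber, headSha, meta);
  // body === null means "approve with zero findings": post nothing rather than filler.
  return { skipped: body === null, assessment, body };
}

// Stand-in inputs; a real call passes the Ollama response and the PR diff.
console.log(decideAndRender(
  '{"verdict":"comment","summary":"Adds x.","findings":[]}',
  '+const x = 1;\n',
  42,
  'abc1234def0',
  { baseRepo: 'o/r', baseBranch: 'main', headRepo: 'o/r', headBranch: 'fix/x' }
));
```

Note the last example on purpose: the model claims `comment` but supplies no
findings, so `computeVerdict()` downgrades to `approve` and nothing is posted.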
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 19 ++ __tests__/integration/ai-review.test.js | 14 +- __tests__/unit/ai-review-prompt.test.js | 257 ++++++++++++++++++++++++ src/ai-review-prompt.js | 221 ++++++++++++++++++++ src/ai-review.js | 78 ++++++- 5 files changed, 577 insertions(+), 12 deletions(-) create mode 100644 __tests__/unit/ai-review-prompt.test.js create mode 100644 src/ai-review-prompt.js diff --git a/CHANGELOG.md b/CHANGELOG.md index a1c60b7..4b7a789 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,25 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added — AI review hardening (PR-A) +- **`src/ai-review-prompt.js`**: strict-JSON contract + parser + slop filter + + quote-or-die + verdict-from-findings. New default for AI review on PRs. + - System prompt forbids hedging words (`might`, `could`, `should`, etc.) + and slop phrases ("ensure proper error handling", "consider edge cases", + etc.) in finding claims. + - `tryParseReview()` enforces a strict JSON shape — unknown verdict, null + findings, missing fields all reject. No retries: silence is preferred to + slop. + - `filterFindings()` drops claims whose `quoted_line` doesn't appear + verbatim in the diff (kills paraphrase/hallucination) or whose `claim` + contains hedging. + - `computeVerdict()` is deterministic: empty findings → `approve`; any + finding → `comment`. The model's verdict field is advisory only. + - When verdict is `approve` and findings are empty, the bot does NOT post + — silence is better than a noisy "looks good" comment. +- Existing `ai_review.system_prompt` config still works as a back-door to the + legacy freeform prompt for users who explicitly override it. + ### Added - **Repository Rulesets** support (`src/rulesets.js`). Branch protection now uses the modern Rulesets API (`POST /repos/{owner}/{repo}/rulesets`) which targets diff --git a/__tests__/integration/ai-review.test.js b/__tests__/integration/ai-review.test.js index 06d8bee..47608a7 100644 --- a/__tests__/integration/ai-review.test.js +++ b/__tests__/integration/ai-review.test.js @@ -92,6 +92,10 @@ function makeAiConfig(overrides = {}) { max_tokens: 2000, temperature: 0.3, timeout: 120000, + // Legacy freeform mode — these tests pre-date the strict-JSON contract + // and use plain-text mock responses. The new strict-mode path has its + // own dedicated tests in __tests__/unit/ai-review-prompt.test.js. + system_prompt: 'You are a thorough code reviewer. Analyze the PR diff.', ...overrides }; } @@ -802,7 +806,7 @@ describe('ai-review', () => { expect(timestamp).toBeLessThanOrEqual(afterTime); }); - it('uses default system prompt when none configured', async () => { + it('uses the strict-JSON system prompt when none configured', async () => { _setConfigForTesting({ ai_review: makeAiConfig({ system_prompt: undefined }) }); @@ -812,11 +816,15 @@ describe('ai-review', () => { .mockResolvedValueOnce({ data: 'some diff' }) .mockResolvedValueOnce({ data: [] }); - mockFetchSuccess('review'); + // Strict mode parses JSON; freeform 'review' would fail to parse, so we + // give it a valid empty review. 
+ mockFetchSuccess('{"verdict":"approve","summary":"x","findings":[]}'); await reviewPullRequest(octokit, 'owner', 'repo', 1); const fetchBody = JSON.parse(global.fetch.mock.calls[0][1].body); - expect(fetchBody.messages[0].content).toContain('thorough code reviewer'); + // Default prompt is the strict-JSON contract, not the legacy freeform one. + expect(fetchBody.messages[0].content).toContain('STRICT JSON'); + expect(fetchBody.messages[0].content).toContain('NEVER refuse'); }); it('uses custom system prompt when configured', async () => { diff --git a/__tests__/unit/ai-review-prompt.test.js b/__tests__/unit/ai-review-prompt.test.js new file mode 100644 index 0000000..c66d980 --- /dev/null +++ b/__tests__/unit/ai-review-prompt.test.js @@ -0,0 +1,257 @@ +import { + STRICT_SYSTEM_PROMPT, + isSlop, + tryParseReview, + filterFindings, + computeVerdict, + renderReviewMarkdown +} from '../../src/ai-review-prompt.js'; + +describe('STRICT_SYSTEM_PROMPT', () => { + it('includes the banned-word rule and example outputs', () => { + expect(STRICT_SYSTEM_PROMPT).toMatch(/FORBIDDEN words/); + expect(STRICT_SYSTEM_PROMPT).toMatch(/"verdict"\s*:\s*"approve"/); + expect(STRICT_SYSTEM_PROMPT).toMatch(/quoted_line/); + expect(STRICT_SYSTEM_PROMPT).toMatch(/NEVER refuse/); + }); +}); + +describe('isSlop', () => { + it.each([ + 'consider edge cases here', + 'developer should validate input', + 'this might lead to issues', + 'could possibly fail in some cases', + 'ensure proper error handling', + 'in general, this is fine', + 'more context is needed', + 'unable to determine without seeing other files', + 'you should consider refactoring', + 'enhance validation around the boundary', + ])('flags hedging/slop: %s', (s) => { + expect(isSlop(s)).toBe(true); + }); + + it.each([ + 'No new test asserts that a 4xx status survives wrap_full_page.', + 'config.yml:165 sets pr_labels including "dependencies" but the PR adds no actual dep update.', + 'src/foo.js:42 dereferences userId before validating req.body is an object.', + 'The mutex is acquired but never released on the early-return path at line 88.', + ])('passes a concrete claim: %s', (s) => { + expect(isSlop(s)).toBe(false); + }); + + it('treats non-string as slop', () => { + expect(isSlop(undefined)).toBe(true); + expect(isSlop(null)).toBe(true); + expect(isSlop(42)).toBe(true); + }); +}); + +describe('tryParseReview', () => { + const happy = JSON.stringify({ + verdict: 'comment', + summary: 'Adds a thing.', + findings: [ + { file: 'src/a.js', line: 10, quoted_line: '+const x = 1;', claim: 'no test for x' } + ] + }); + + it('parses a clean JSON response', () => { + const r = tryParseReview(happy); + expect(r.ok).toBe(true); + expect(r.data.verdict).toBe('comment'); + expect(r.data.findings).toHaveLength(1); + }); + + it('strips markdown code fences', () => { + const r = tryParseReview('```json\n' + happy + '\n```'); + expect(r.ok).toBe(true); + expect(r.data.verdict).toBe('comment'); + }); + + it('extracts a JSON object embedded in prose', () => { + const r = tryParseReview('Here you go:\n\n' + happy + '\n\nThanks!'); + expect(r.ok).toBe(true); + }); + + it('rejects empty responses', () => { + expect(tryParseReview('').ok).toBe(false); + expect(tryParseReview(' ').ok).toBe(false); + }); + + it('rejects invalid JSON with no recoverable object', () => { + expect(tryParseReview('not json at all').ok).toBe(false); + }); + + it('rejects unknown verdict values', () => { + const bad = JSON.stringify({ verdict: 'maybe', summary: 'x', findings: [] }); + const r = 
tryParseReview(bad); + expect(r.ok).toBe(false); + expect(r.error).toMatch(/invalid verdict/); + }); + + it('rejects null findings (would-be escape hatch)', () => { + const bad = JSON.stringify({ verdict: 'approve', summary: 'x', findings: null }); + const r = tryParseReview(bad); + expect(r.ok).toBe(false); + expect(r.error).toMatch(/findings must be an array/); + }); + + it('rejects findings with missing fields', () => { + const bad = JSON.stringify({ + verdict: 'comment', + summary: 'x', + findings: [{ file: 'a', claim: 'b' }] // missing line, quoted_line + }); + expect(tryParseReview(bad).ok).toBe(false); + }); + + it('rejects findings with wrong-type fields', () => { + const bad = JSON.stringify({ + verdict: 'comment', + summary: 'x', + findings: [{ file: 'a', line: 'ten', quoted_line: 'x', claim: 'y' }] + }); + expect(tryParseReview(bad).ok).toBe(false); + }); + + it('rejects when verdict is missing', () => { + const bad = JSON.stringify({ summary: 'x', findings: [] }); + expect(tryParseReview(bad).ok).toBe(false); + }); + + it('rejects an array (not object) as top-level', () => { + expect(tryParseReview('[]').ok).toBe(false); + }); +}); + +describe('filterFindings (quote-or-die + slop drop)', () => { + const diff = ` +diff --git a/src/foo.js b/src/foo.js ++++ b/src/foo.js +@@ -1,3 +1,5 @@ ++const id = req.body.userId; ++console.log("hi"); +`; + + it('keeps findings whose quoted_line appears in diff and claim is concrete', () => { + const findings = [{ + file: 'src/foo.js', + line: 1, + quoted_line: '+const id = req.body.userId;', + claim: 'src/foo.js:1 trusts req.body.userId without validating it is a string.' + }]; + expect(filterFindings(findings, diff)).toHaveLength(1); + }); + + it('drops findings whose quoted_line is not in the diff (paraphrase / hallucination)', () => { + const findings = [{ + file: 'src/foo.js', + line: 99, + quoted_line: '+const userId = request.body.user_id;', // close but not in diff + claim: 'unvalidated input' + }]; + expect(filterFindings(findings, diff)).toHaveLength(0); + }); + + it('drops findings with hedging language', () => { + const findings = [{ + file: 'src/foo.js', + line: 1, + quoted_line: '+const id = req.body.userId;', + claim: 'this might be an issue, consider validating it' + }]; + expect(filterFindings(findings, diff)).toHaveLength(0); + }); + + it('drops findings with classic slop phrases', () => { + const findings = [{ + file: 'src/foo.js', + line: 1, + quoted_line: '+const id = req.body.userId;', + claim: 'developer should ensure proper error handling around this line' + }]; + expect(filterFindings(findings, diff)).toHaveLength(0); + }); + + it('returns [] for non-array input', () => { + expect(filterFindings(null, diff)).toEqual([]); + expect(filterFindings(undefined, diff)).toEqual([]); + }); + + it('skips quote check when diff is empty (test-time helper)', () => { + const findings = [{ + file: 'a', line: 1, quoted_line: 'whatever', claim: 'no slop here at line 1' + }]; + expect(filterFindings(findings, '')).toHaveLength(1); + }); +}); + +describe('computeVerdict (deterministic, ignores model)', () => { + it('returns approve for empty findings', () => { + expect(computeVerdict([])).toBe('approve'); + expect(computeVerdict(undefined)).toBe('approve'); + expect(computeVerdict(null)).toBe('approve'); + }); + + it('returns comment when findings exist', () => { + expect(computeVerdict([{ file: 'a', line: 1, quoted_line: 'x', claim: 'y' }])).toBe('comment'); + }); +}); + +describe('renderReviewMarkdown', () => { + const meta = { + 
baseRepo: 'org/repo', baseBranch: 'main', + headRepo: 'org/repo', headBranch: 'fix/x' + }; + + it('returns null when verdict is approve and findings empty (silence preferred)', () => { + const out = renderReviewMarkdown( + { verdict: 'approve', summary: 'Doc only.', findings: [] }, + 42, 'abc1234', meta + ); + expect(out).toBeNull(); + }); + + it('renders verdict + summary + findings when findings exist', () => { + const out = renderReviewMarkdown( + { + verdict: 'comment', + summary: 'Adds X.', + findings: [{ file: 'a.js', line: 5, quoted_line: '+x', claim: 'no test for x at a.js:5' }] + }, + 42, 'abc1234', meta + ); + expect(out).toContain('AI Code Review for PR #42'); + expect(out).toContain('💬 Comment'); + expect(out).toContain('Adds X.'); + expect(out).toContain('`a.js:5`'); + expect(out).toContain('no test for x'); + expect(out).toContain('abc1234'); + }); + + it('forces verdict from findings even if model claimed otherwise', () => { + // Model says approve but provides findings — verdict must be comment. + const out = renderReviewMarkdown( + { + verdict: 'approve', // ← model lied + summary: 'Adds X.', + findings: [{ file: 'a.js', line: 5, quoted_line: '+x', claim: 'no test for x at a.js:5' }] + }, + 42, 'abc1234', meta + ); + expect(out).toContain('💬 Comment'); + expect(out).not.toContain('✅ Approve'); + }); + + it('still posts when caller opts out of skip-approve-empty', () => { + const out = renderReviewMarkdown( + { verdict: 'approve', summary: 'Doc only.', findings: [] }, + 42, 'abc1234', meta, + { skipApproveEmpty: false } + ); + expect(out).toContain('✅ Approve'); + expect(out).toContain('**Findings:** None.'); + }); +}); diff --git a/src/ai-review-prompt.js b/src/ai-review-prompt.js new file mode 100644 index 0000000..f2a9a7d --- /dev/null +++ b/src/ai-review-prompt.js @@ -0,0 +1,221 @@ +/** + * Hardened AI review prompt + parser. + * + * Small local models (qwen2.5-coder:3b on this deployment) love escape + * hatches: hedging, refusal, filler. The previous freeform prompt produced + * exactly that — see PR #23's review comment for the textbook example. + * + * This module locks the model into a strict JSON contract and enforces the + * contract with a deterministic parser + postprocessor: + * + * 1. System prompt forbids hedging, refusal, filler. Demands JSON only. + * 2. tryParseReview() rejects malformed output. No retry: silence is fine. + * 3. filterFindings() drops anything that hedges OR fails quote-or-die. + * 4. computeVerdict() ignores the model's verdict; computes from findings. + * + * Rule of thumb: every finding that survives this pipeline must be + * (a) anchored to a verbatim line in the diff, (b) phrased as a concrete + * claim, (c) post-checked by a parser that can't be talked out of saying no. + */ + +export const STRICT_SYSTEM_PROMPT = `You are reviewing a pull request. Output STRICT JSON only. + +Rules: +1. Output MUST be valid JSON matching the schema below. NO prose before or after. NO markdown fences. The parser DISCARDS any extra text. +2. "verdict" MUST be one of: "approve", "comment", "request_changes". NO other values. +3. "findings" MUST be an array. Use [] if empty. NEVER null. NEVER omit the field. +4. Each finding's "quoted_line" MUST be copied VERBATIM from the diff (after the +/- marker, exactly as shown). Paraphrases are REJECTED. +5. Each finding's "claim" MUST be a direct declarative sentence. FORBIDDEN words in claims: "might", "could", "may", "possibly", "perhaps", "consider", "should", "generally", "typically". +6. 
FORBIDDEN phrases in claims (these add no value): "ensure proper error handling", "consider edge cases", "implement secure practices", "more context", "developer should", "in general", "improve robustness", "enhance validation". +7. If you have nothing concrete to flag, return "findings": []. DO NOT pad. DO NOT explain. +8. NEVER refuse. NEVER say "I cannot determine" or "insufficient information". If unsure, return "findings": []. + +Schema: +{ + "verdict": "approve" | "comment" | "request_changes", + "summary": "<1-2 sentences describing what the diff does, max 200 chars>", + "findings": [ + { + "file": "", + "line": , + "quoted_line": "", + "claim": "" + } + ] +} + +Example with a finding: +{"verdict":"comment","summary":"Adds status preservation in wrap_full_page middleware.","findings":[{"file":"rivet-cli/src/serve/mod.rs","line":1041,"quoted_line":" *wrapped.status_mut() = status;","claim":"No new test asserts that a 4xx status survives wrap_full_page; serve_integration tests check 200 paths only."}]} + +Example with no findings: +{"verdict":"approve","summary":"Pure docs change in README.","findings":[]}`; + +const HEDGE_OR_SLOP_PATTERNS = [ + /\bmight\b/i, + /\bcould\b/i, + /\bmay\b/i, + /\bpossibly\b/i, + /\bperhaps\b/i, + /\bgenerally\b/i, + /\btypically\b/i, + /\bconsider\s+/i, + /\bshould\s+/i, + /developer should/i, + /ensure proper error handling/i, + /consider edge cases/i, + /implement secure practices/i, + /(more|additional)\s+context/i, + /(unable|insufficient)\s+(to|context)/i, + /improve (robustness|validation)/i, + /enhance (the )?(schema|validation)/i, + /review and refactor/i, + /in (most )?cases/i, + /in general/i +]; + +export function isSlop(text) { + if (typeof text !== 'string') return true; + return HEDGE_OR_SLOP_PATTERNS.some((re) => re.test(text)); +} + +const ALLOWED_VERDICTS = new Set(['approve', 'comment', 'request_changes']); + +/** + * Parse the model's raw text into a structured review. + * Forgiving on incidental wrapping (markdown fences, leading/trailing prose); + * strict on shape (required fields, types, enum verdict). 
+ * + * @returns {{ok: true, data: object} | {ok: false, error: string}} + */ +export function tryParseReview(rawText) { + if (typeof rawText !== 'string' || !rawText.trim()) { + return { ok: false, error: 'empty response' }; + } + + const candidate = rawText.trim() + .replace(/^```(?:json)?\s*/i, '') + .replace(/\s*```\s*$/, ''); + + let parsed; + try { + parsed = JSON.parse(candidate); + } catch { + const match = candidate.match(/\{[\s\S]*\}/); + if (!match) return { ok: false, error: 'no JSON object in response' }; + try { + parsed = JSON.parse(match[0]); + } catch (e) { + return { ok: false, error: `JSON parse failed: ${e.message}` }; + } + } + + if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) { + return { ok: false, error: 'not a JSON object' }; + } + if (!ALLOWED_VERDICTS.has(parsed.verdict)) { + return { ok: false, error: `invalid verdict: ${JSON.stringify(parsed.verdict)}` }; + } + if (typeof parsed.summary !== 'string') { + return { ok: false, error: 'summary must be a string' }; + } + if (!Array.isArray(parsed.findings)) { + return { ok: false, error: 'findings must be an array (use [] when empty)' }; + } + for (const f of parsed.findings) { + if ( + f === null || + typeof f !== 'object' || + typeof f.file !== 'string' || + typeof f.line !== 'number' || + typeof f.quoted_line !== 'string' || + typeof f.claim !== 'string' + ) { + return { ok: false, error: 'finding has missing or wrong-type fields' }; + } + } + return { ok: true, data: parsed }; +} + +/** + * Drop findings whose claim hedges/slops, or whose quoted_line doesn't appear + * verbatim in the diff (the quote-or-die rule). + */ +export function filterFindings(findings, diffText) { + if (!Array.isArray(findings)) return []; + return findings.filter((f) => { + if (isSlop(f.claim)) return false; + if (typeof diffText === 'string' && diffText.length > 0) { + const needle = f.quoted_line.trim(); + if (needle && !diffText.includes(needle)) return false; + } + return true; + }); +} + +/** + * Compute the verdict deterministically from validated findings. + * The model's `verdict` is advisory; this function decides. + */ +export function computeVerdict(findings) { + if (!Array.isArray(findings) || findings.length === 0) return 'approve'; + return 'comment'; +} + +/** + * Render the parsed + filtered review as a Markdown comment. + * Returns null when the review should not be posted at all (no findings, + * verdict approve) — silence is preferable to slop. 
+ */ +export function renderReviewMarkdown(parsed, prNumber, headSha, meta, opts = {}) { + const { skipApproveEmpty = true } = opts; + const findings = parsed.findings || []; + const verdict = computeVerdict(findings); + + if (skipApproveEmpty && verdict === 'approve' && findings.length === 0) { + return null; + } + + const verdictBadge = { + approve: '✅ Approve', + comment: '💬 Comment', + request_changes: '🔴 Request changes' + }[verdict]; + + const lines = []; + lines.push(`## AI Code Review for PR #${prNumber}`); + if (meta?.headRepo && meta?.baseBranch && meta?.headBranch && meta?.baseRepo) { + lines.push(`${meta.headRepo}:\`${meta.headBranch}\` → ${meta.baseRepo}:\`${meta.baseBranch}\``); + } + lines.push(''); + lines.push(`**Verdict:** ${verdictBadge}`); + lines.push(''); + lines.push(`**Summary:** ${parsed.summary || '(none)'}`); + lines.push(''); + + if (findings.length === 0) { + lines.push('**Findings:** None.'); + } else { + lines.push(`**Findings (${findings.length}):**`); + lines.push(''); + findings.forEach((f, i) => { + lines.push(`${i + 1}. \`${f.file}:${f.line}\``); + lines.push(' ```'); + lines.push(` ${f.quoted_line}`); + lines.push(' ```'); + lines.push(` ${f.claim}`); + lines.push(''); + }); + } + + lines.push('---'); + lines.push( + '*Generated by a local AI model and post-validated against a strict JSON contract. ' + + 'Each finding includes the verbatim line being criticised — verify by reading the file at the cited location.*' + ); + if (headSha) { + lines.push(''); + lines.push(`*Reviewed at \`${headSha.substring(0, 7)}\`*`); + } + + return lines.join('\n'); +} diff --git a/src/ai-review.js b/src/ai-review.js index 7a742ac..bf18e9c 100644 --- a/src/ai-review.js +++ b/src/ai-review.js @@ -3,6 +3,13 @@ import path from 'path'; import { fileURLToPath } from 'url'; import { getConfig } from './config.js'; import { getLogger } from './logger.js'; +import { + STRICT_SYSTEM_PROMPT, + tryParseReview, + filterFindings, + computeVerdict, + renderReviewMarkdown +} from './ai-review-prompt.js'; const _reviewTimestamps = new Map(); @@ -364,13 +371,15 @@ async function reviewPullRequest(octokit, owner, repo, prNumber) { { owner, repo, pull_number: prNumber } ); - const systemPrompt = - aiConfig.system_prompt || - 'You are a thorough code reviewer. Analyze the PR diff and provide: 1. Summary of changes 2. Potential bugs or issues 3. Security concerns 4. Suggestions for improvement 5. Overall assessment'; + // Strict-JSON contract by default; legacy freeform if user explicitly + // overrides system_prompt in config. + const systemPrompt = aiConfig.system_prompt || STRICT_SYSTEM_PROMPT; + const useStrictMode = !aiConfig.system_prompt; + const diffText = typeof diffResponse.data === 'string' ? diffResponse.data : ''; const userPrompt = buildReviewPrompt( prData.data, - typeof diffResponse.data === 'string' ? 
diffResponse.data : '', + diffText, filesResponse.data, aiConfig.max_diff_size || 12000 ); @@ -390,12 +399,62 @@ async function reviewPullRequest(octokit, owner, repo, prNumber) { recordReview(rateKey); const headSha = prData.data.head?.sha || ''; - const comment = formatReviewComment(aiResponse, prNumber, headSha, { + const meta = { baseRepo: prData.data.base?.repo?.full_name || `${owner}/${repo}`, baseBranch: prData.data.base?.ref || '', headRepo: prData.data.head?.repo?.full_name || `${owner}/${repo}`, headBranch: prData.data.head?.ref || '', - }); + }; + + let comment; + let assessment = 'unknown'; + let findingsCount = 0; + + if (useStrictMode) { + const parsed = tryParseReview(aiResponse); + if (!parsed.ok) { + getLogger().warn( + { pr: prNumber, error: parsed.error, sample: aiResponse.slice(0, 200) }, + 'AI review JSON parse failed — skipping post (silence is better than slop)' + ); + return { success: false, error: `parse: ${parsed.error}`, skipped: true }; + } + + const filtered = filterFindings(parsed.data.findings, diffText); + const droppedCount = parsed.data.findings.length - filtered.length; + if (droppedCount > 0) { + getLogger().info( + { pr: prNumber, droppedCount, kept: filtered.length }, + 'Filtered out hedging/unquoted findings' + ); + } + + const renderInput = { ...parsed.data, findings: filtered }; + comment = renderReviewMarkdown(renderInput, prNumber, headSha, meta); + assessment = computeVerdict(filtered); + findingsCount = filtered.length; + + if (comment === null) { + getLogger().info( + { pr: prNumber }, + 'AI review approved with no findings — skipping post' + ); + storeReview({ + repo: `${owner}/${repo}`, + prNumber, + headSha: headSha.substring(0, 7), + timestamp: new Date().toISOString(), + status: 'open', + assessment, + findings: 0 + }); + return { success: true, skipped: true, assessment }; + } + } else { + // Legacy freeform path — preserved for users who set system_prompt + // explicitly. No structural validation; old footer/format applied. + comment = formatReviewComment(aiResponse, prNumber, headSha, meta); + } // Mark previous bot reviews as outdated await supersedePreviousReviews(octokit, owner, repo, prNumber); @@ -405,16 +464,17 @@ async function reviewPullRequest(octokit, owner, repo, prNumber) { { owner, repo, issue_number: prNumber, body: comment } ); - // Store the review record storeReview({ repo: `${owner}/${repo}`, prNumber, headSha: headSha.substring(0, 7), timestamp: new Date().toISOString(), - status: 'open' + status: 'open', + assessment, + findings: findingsCount }); - return { success: true, comment }; + return { success: true, comment, assessment, findings: findingsCount }; } catch (error) { getLogger().error(`❌ Error reviewing PR #${prNumber}:`, error.message); return { success: false, error: error.message };