Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,25 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Added — AI review hardening (PR-A)
- **`src/ai-review-prompt.js`**: strict-JSON contract + parser + slop filter +
quote-or-die + verdict-from-findings. New default for AI review on PRs.
- System prompt forbids hedging words (`might`, `could`, `should`, etc.)
and slop phrases ("ensure proper error handling", "consider edge cases",
etc.) in finding claims.
- `tryParseReview()` enforces a strict JSON shape — unknown verdict, null
findings, missing fields all reject. No retries: silence is preferred to
slop.
- `filterFindings()` drops claims whose `quoted_line` doesn't appear
verbatim in the diff (kills paraphrase/hallucination) or whose `claim`
contains hedging.
- `computeVerdict()` is deterministic: empty findings → `approve`; any
finding → `comment`. The model's verdict field is advisory only.
- When verdict is `approve` and findings are empty, the bot does NOT post
— silence is better than a noisy "looks good" comment.
- Existing `ai_review.system_prompt` config still works as a back-door to the
legacy freeform prompt for users who explicitly override it.

### Added
- **Repository Rulesets** support (`src/rulesets.js`). Branch protection now uses
the modern Rulesets API (`POST /repos/{owner}/{repo}/rulesets`) which targets
Expand Down
14 changes: 11 additions & 3 deletions __tests__/integration/ai-review.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,10 @@ function makeAiConfig(overrides = {}) {
max_tokens: 2000,
temperature: 0.3,
timeout: 120000,
// Legacy freeform mode — these tests pre-date the strict-JSON contract
// and use plain-text mock responses. The new strict-mode path has its
// own dedicated tests in __tests__/unit/ai-review-prompt.test.js.
system_prompt: 'You are a thorough code reviewer. Analyze the PR diff.',
...overrides
};
}
Expand Down Expand Up @@ -802,7 +806,7 @@ describe('ai-review', () => {
expect(timestamp).toBeLessThanOrEqual(afterTime);
});

it('uses default system prompt when none configured', async () => {
it('uses the strict-JSON system prompt when none configured', async () => {
_setConfigForTesting({
ai_review: makeAiConfig({ system_prompt: undefined })
});
Expand All @@ -812,11 +816,15 @@ describe('ai-review', () => {
.mockResolvedValueOnce({ data: 'some diff' })
.mockResolvedValueOnce({ data: [] });

mockFetchSuccess('review');
// Strict mode parses JSON; freeform 'review' would fail to parse, so we
// give it a valid empty review.
mockFetchSuccess('{"verdict":"approve","summary":"x","findings":[]}');
await reviewPullRequest(octokit, 'owner', 'repo', 1);

const fetchBody = JSON.parse(global.fetch.mock.calls[0][1].body);
expect(fetchBody.messages[0].content).toContain('thorough code reviewer');
// Default prompt is the strict-JSON contract, not the legacy freeform one.
expect(fetchBody.messages[0].content).toContain('STRICT JSON');
expect(fetchBody.messages[0].content).toContain('NEVER refuse');
});

it('uses custom system prompt when configured', async () => {
Expand Down
257 changes: 257 additions & 0 deletions __tests__/unit/ai-review-prompt.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,257 @@
import {
STRICT_SYSTEM_PROMPT,
isSlop,
tryParseReview,
filterFindings,
computeVerdict,
renderReviewMarkdown
} from '../../src/ai-review-prompt.js';

describe('STRICT_SYSTEM_PROMPT', () => {
  it('includes the banned-word rule and example outputs', () => {
    // Each pattern is a clause the strict prompt must carry; checked in order.
    const requiredPatterns = [
      /FORBIDDEN words/,
      /"verdict"\s*:\s*"approve"/,
      /quoted_line/,
      /NEVER refuse/
    ];
    for (const pattern of requiredPatterns) {
      expect(STRICT_SYSTEM_PROMPT).toMatch(pattern);
    }
  });
});

describe('isSlop', () => {
  // Hedging / boilerplate claims that the filter must reject.
  const sloppyClaims = [
    'consider edge cases here',
    'developer should validate input',
    'this might lead to issues',
    'could possibly fail in some cases',
    'ensure proper error handling',
    'in general, this is fine',
    'more context is needed',
    'unable to determine without seeing other files',
    'you should consider refactoring',
    'enhance validation around the boundary',
  ];

  // Specific, falsifiable claims that must survive the filter.
  const concreteClaims = [
    'No new test asserts that a 4xx status survives wrap_full_page.',
    'config.yml:165 sets pr_labels including "dependencies" but the PR adds no actual dep update.',
    'src/foo.js:42 dereferences userId before validating req.body is an object.',
    'The mutex is acquired but never released on the early-return path at line 88.',
  ];

  it.each(sloppyClaims)('flags hedging/slop: %s', (text) => {
    expect(isSlop(text)).toBe(true);
  });

  it.each(concreteClaims)('passes a concrete claim: %s', (text) => {
    expect(isSlop(text)).toBe(false);
  });

  it('treats non-string as slop', () => {
    for (const value of [undefined, null, 42]) {
      expect(isSlop(value)).toBe(true);
    }
  });
});

describe('tryParseReview', () => {
  // Canonical valid response; the happy-path cases all derive from it.
  const validReview = {
    verdict: 'comment',
    summary: 'Adds a thing.',
    findings: [
      { file: 'src/a.js', line: 10, quoted_line: '+const x = 1;', claim: 'no test for x' }
    ]
  };
  const happy = JSON.stringify(validReview);

  // Shorthand for the many shapes that must be rejected outright.
  const expectRejected = (raw) => {
    expect(tryParseReview(raw).ok).toBe(false);
  };

  it('parses a clean JSON response', () => {
    const result = tryParseReview(happy);
    expect(result.ok).toBe(true);
    expect(result.data.verdict).toBe('comment');
    expect(result.data.findings).toHaveLength(1);
  });

  it('strips markdown code fences', () => {
    const fenced = ['```json', happy, '```'].join('\n');
    const result = tryParseReview(fenced);
    expect(result.ok).toBe(true);
    expect(result.data.verdict).toBe('comment');
  });

  it('extracts a JSON object embedded in prose', () => {
    const wrapped = `Here you go:\n\n${happy}\n\nThanks!`;
    expect(tryParseReview(wrapped).ok).toBe(true);
  });

  it('rejects empty responses', () => {
    expectRejected('');
    expectRejected(' ');
  });

  it('rejects invalid JSON with no recoverable object', () => {
    expectRejected('not json at all');
  });

  it('rejects unknown verdict values', () => {
    const bad = JSON.stringify({ verdict: 'maybe', summary: 'x', findings: [] });
    const result = tryParseReview(bad);
    expect(result.ok).toBe(false);
    expect(result.error).toMatch(/invalid verdict/);
  });

  it('rejects null findings (would-be escape hatch)', () => {
    const bad = JSON.stringify({ verdict: 'approve', summary: 'x', findings: null });
    const result = tryParseReview(bad);
    expect(result.ok).toBe(false);
    expect(result.error).toMatch(/findings must be an array/);
  });

  it('rejects findings with missing fields', () => {
    // Finding lacks both `line` and `quoted_line`.
    expectRejected(JSON.stringify({
      verdict: 'comment',
      summary: 'x',
      findings: [{ file: 'a', claim: 'b' }]
    }));
  });

  it('rejects findings with wrong-type fields', () => {
    // `line` must be a number, not the string 'ten'.
    expectRejected(JSON.stringify({
      verdict: 'comment',
      summary: 'x',
      findings: [{ file: 'a', line: 'ten', quoted_line: 'x', claim: 'y' }]
    }));
  });

  it('rejects when verdict is missing', () => {
    expectRejected(JSON.stringify({ summary: 'x', findings: [] }));
  });

  it('rejects an array (not object) as top-level', () => {
    expectRejected('[]');
  });
});

describe('filterFindings (quote-or-die + slop drop)', () => {
  const diff = `
diff --git a/src/foo.js b/src/foo.js
+++ b/src/foo.js
@@ -1,3 +1,5 @@
+const id = req.body.userId;
+console.log("hi");
`;

  // Builds a finding that passes both checks, with optional field overrides.
  const makeFinding = (overrides = {}) => ({
    file: 'src/foo.js',
    line: 1,
    quoted_line: '+const id = req.body.userId;',
    claim: 'src/foo.js:1 trusts req.body.userId without validating it is a string.',
    ...overrides
  });

  it('keeps findings whose quoted_line appears in diff and claim is concrete', () => {
    expect(filterFindings([makeFinding()], diff)).toHaveLength(1);
  });

  it('drops findings whose quoted_line is not in the diff (paraphrase / hallucination)', () => {
    // quoted_line is close to the real added line, but not verbatim.
    const paraphrased = makeFinding({
      line: 99,
      quoted_line: '+const userId = request.body.user_id;',
      claim: 'unvalidated input'
    });
    expect(filterFindings([paraphrased], diff)).toHaveLength(0);
  });

  it('drops findings with hedging language', () => {
    const hedged = makeFinding({
      claim: 'this might be an issue, consider validating it'
    });
    expect(filterFindings([hedged], diff)).toHaveLength(0);
  });

  it('drops findings with classic slop phrases', () => {
    const sloppy = makeFinding({
      claim: 'developer should ensure proper error handling around this line'
    });
    expect(filterFindings([sloppy], diff)).toHaveLength(0);
  });

  it('returns [] for non-array input', () => {
    expect(filterFindings(null, diff)).toEqual([]);
    expect(filterFindings(undefined, diff)).toEqual([]);
  });

  it('skips quote check when diff is empty (test-time helper)', () => {
    const findings = [{
      file: 'a', line: 1, quoted_line: 'whatever', claim: 'no slop here at line 1'
    }];
    expect(filterFindings(findings, '')).toHaveLength(1);
  });
});

describe('computeVerdict (deterministic, ignores model)', () => {
  it('returns approve for empty findings', () => {
    // [], undefined, and null all count as "no findings".
    for (const emptyish of [[], undefined, null]) {
      expect(computeVerdict(emptyish)).toBe('approve');
    }
  });

  it('returns comment when findings exist', () => {
    const single = [{ file: 'a', line: 1, quoted_line: 'x', claim: 'y' }];
    expect(computeVerdict(single)).toBe('comment');
  });
});

describe('renderReviewMarkdown', () => {
  const meta = {
    baseRepo: 'org/repo',
    baseBranch: 'main',
    headRepo: 'org/repo',
    headBranch: 'fix/x'
  };

  // Shared fixtures: an empty approve review and a single concrete finding.
  const emptyApprove = { verdict: 'approve', summary: 'Doc only.', findings: [] };
  const oneFinding = [
    { file: 'a.js', line: 5, quoted_line: '+x', claim: 'no test for x at a.js:5' }
  ];

  it('returns null when verdict is approve and findings empty (silence preferred)', () => {
    const out = renderReviewMarkdown(emptyApprove, 42, 'abc1234', meta);
    expect(out).toBeNull();
  });

  it('renders verdict + summary + findings when findings exist', () => {
    const out = renderReviewMarkdown(
      { verdict: 'comment', summary: 'Adds X.', findings: oneFinding },
      42, 'abc1234', meta
    );
    const expectedFragments = [
      'AI Code Review for PR #42',
      '💬 Comment',
      'Adds X.',
      '`a.js:5`',
      'no test for x',
      'abc1234'
    ];
    for (const fragment of expectedFragments) {
      expect(out).toContain(fragment);
    }
  });

  it('forces verdict from findings even if model claimed otherwise', () => {
    // Model says approve but provides findings — verdict must be comment.
    const out = renderReviewMarkdown(
      { verdict: 'approve', summary: 'Adds X.', findings: oneFinding },
      42, 'abc1234', meta
    );
    expect(out).toContain('💬 Comment');
    expect(out).not.toContain('✅ Approve');
  });

  it('still posts when caller opts out of skip-approve-empty', () => {
    const out = renderReviewMarkdown(
      emptyApprove,
      42, 'abc1234', meta,
      { skipApproveEmpty: false }
    );
    expect(out).toContain('✅ Approve');
    expect(out).toContain('**Findings:** None.');
  });
});
Loading
Loading