From 51da648ffb75c8b951f263fe1e9bd0efaa906cbe Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Mon, 27 Apr 2026 06:12:03 +0200 Subject: [PATCH] feat: grammar-constrained AI review output (Ollama JSON schema mode) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Why Today's live test revealed the small model's failure mode: it can produce valid JSON of the *wrong shape*. PR #222 got `{"commit_message":"Add pre-commit configuration files"}` instead of `{verdict, summary, findings}`. Strict parser correctly rejected; bot fell to silence. The strict prompt teaches the model the *intent*. The schema enforces the *shape*. Belt-and-suspenders. ## What New exports in `src/ai-review-prompt.js`: - `REVIEW_JSON_SCHEMA` — full JSON Schema for the review payload, with `verdict` enum, required fields, and `additionalProperties: false`. - `buildResponseFormat()` — wraps the schema in OpenAI-compat `response_format: { type: 'json_schema', json_schema: { name, strict, schema } }`. Ollama (≥0.5) honours this on `/v1/chat/completions` for grammar-constrained generation — the decoder physically cannot emit tokens that violate the schema. `src/ai-review.js`: - `callLocalAI` accepts a new `responseFormat` option. When set, it goes on the wire as `response_format`. Older Ollama versions ignore unknown fields, so the bot degrades gracefully to strict-prompt + parser path. - Strict mode passes `buildResponseFormat()` automatically. Legacy freeform path (when `system_prompt` is overridden) does not — that path expects prose. ## Why this is a bigger deal than it looks The Netcup VPS is RAM-bound: 3.8 GB total, no swap. Anything bigger than qwen2.5-coder:3b at Q4 risks OOM on inference. So we cannot just upgrade the model. Constrained decoding lets the existing 3 B model succeed on the shape problem the prompt couldn't fix on its own. 
## Test plan - [x] All 773 tests pass (was 766 — added 7 covering schema shape, response format builder, wire-payload pass-through, legacy-mode omission) - [x] eslint clean - [ ] After deploy: trigger `/review-pr` on a recent PR. Wire-level: the Ollama request body should contain `response_format` field. Behaviour: no more `"AI review JSON parse failed: invalid verdict: undefined"` log lines (the previous failure was wrong-shape JSON; this kills it). ## Risk & rollout - Risk: low. New field is opt-in via the `responseFormat` argument; absent → wire payload unchanged from current behaviour. Old Ollama: silently ignores; falls through to existing parser. - Rollout: self-update on merge. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> --- __tests__/integration/ai-review.test.js | 19 ++++++++ __tests__/unit/ai-review-prompt.test.js | 36 +++++++++++++++ src/ai-review-prompt.js | 59 +++++++++++++++++++++++++ src/ai-review.js | 43 ++++++++++++------ 4 files changed, 144 insertions(+), 13 deletions(-) diff --git a/__tests__/integration/ai-review.test.js b/__tests__/integration/ai-review.test.js index 47608a7..c792031 100644 --- a/__tests__/integration/ai-review.test.js +++ b/__tests__/integration/ai-review.test.js @@ -629,6 +629,25 @@ describe('ai-review', () => { const callArgs = global.fetch.mock.calls[0]; expect(callArgs[1].signal).toBeInstanceOf(AbortSignal); }); + + it('passes responseFormat through to the wire payload as response_format', async () => { + mockFetchSuccess('{"verdict":"approve","summary":"x","findings":[]}'); + const fmt = { type: 'json_schema', json_schema: { name: 'X', strict: true, schema: { type: 'object' } } }; + + await callLocalAI(mockEndpoint, mockModel, mockSystemPrompt, mockUserPrompt, { + responseFormat: fmt + }); + + const body = JSON.parse(global.fetch.mock.calls[0][1].body); + expect(body.response_format).toEqual(fmt); + }); + + it('omits response_format when not requested 
(legacy compat)', async () => { + mockFetchSuccess('plain prose'); + await callLocalAI(mockEndpoint, mockModel, mockSystemPrompt, mockUserPrompt); + const body = JSON.parse(global.fetch.mock.calls[0][1].body); + expect(body.response_format).toBeUndefined(); + }); }); // ========================================================================= diff --git a/__tests__/unit/ai-review-prompt.test.js b/__tests__/unit/ai-review-prompt.test.js index 4502880..727c338 100644 --- a/__tests__/unit/ai-review-prompt.test.js +++ b/__tests__/unit/ai-review-prompt.test.js @@ -1,5 +1,7 @@ import { STRICT_SYSTEM_PROMPT, + REVIEW_JSON_SCHEMA, + buildResponseFormat, isSlop, tryParseReview, filterFindings, @@ -16,6 +18,40 @@ describe('STRICT_SYSTEM_PROMPT', () => { }); }); +describe('REVIEW_JSON_SCHEMA', () => { + it('declares the verdict enum exactly matching the parser', () => { + expect(REVIEW_JSON_SCHEMA.properties.verdict.enum).toEqual([ + 'approve', 'comment', 'request_changes' + ]); + }); + + it('marks verdict, summary, findings as required', () => { + expect(REVIEW_JSON_SCHEMA.required.sort()).toEqual( + ['findings', 'summary', 'verdict'] + ); + }); + + it('rejects extra properties (additionalProperties: false)', () => { + expect(REVIEW_JSON_SCHEMA.additionalProperties).toBe(false); + expect(REVIEW_JSON_SCHEMA.properties.findings.items.additionalProperties).toBe(false); + }); + + it('every finding field used by the parser is declared in the schema', () => { + const fields = Object.keys(REVIEW_JSON_SCHEMA.properties.findings.items.properties).sort(); + expect(fields).toEqual(['claim', 'file', 'line', 'quoted_line']); + }); +}); + +describe('buildResponseFormat', () => { + it('returns OpenAI-compat json_schema payload', () => { + const rf = buildResponseFormat(); + expect(rf.type).toBe('json_schema'); + expect(rf.json_schema.name).toBe('PullRequestReview'); + expect(rf.json_schema.strict).toBe(true); + expect(rf.json_schema.schema).toBe(REVIEW_JSON_SCHEMA); + }); +}); + 
describe('isSlop', () => { it.each([ 'consider edge cases here', diff --git a/src/ai-review-prompt.js b/src/ai-review-prompt.js index 9963ba5..b5d5807 100644 --- a/src/ai-review-prompt.js +++ b/src/ai-review-prompt.js @@ -50,6 +50,65 @@ Example with a finding: Example with no findings: {"verdict":"approve","summary":"Pure docs change in README.","findings":[]}`; +/** + * JSON Schema describing the strict review payload. Used as a wire-format + * contract: passed to Ollama via OpenAI-compat `response_format` so the + * decoder is grammar-constrained at generation time. The model literally + * cannot emit a token sequence that violates the schema. + * + * Belt-and-suspenders relative to the prompt rules above: the prompt + * teaches the *intent* (banned hedging, quote-or-die etc.) while the schema + * enforces the *shape*. Today's wrong-shape failures (model emitting + * `{"commit_message":"..."}` instead of the review object) become + * impossible. + */ +export const REVIEW_JSON_SCHEMA = { + type: 'object', + additionalProperties: false, + properties: { + verdict: { + type: 'string', + enum: ['approve', 'comment', 'request_changes'] + }, + summary: { + type: 'string', + maxLength: 400 + }, + findings: { + type: 'array', + items: { + type: 'object', + additionalProperties: false, + properties: { + file: { type: 'string', minLength: 1 }, + line: { type: 'integer', minimum: 1 }, + quoted_line: { type: 'string', minLength: 1 }, + claim: { type: 'string', minLength: 5, maxLength: 400 } + }, + required: ['file', 'line', 'quoted_line', 'claim'] + } + } + }, + required: ['verdict', 'summary', 'findings'] +}; + +/** + * Build the OpenAI-compat `response_format` payload for grammar-constrained + * JSON output. Ollama (≥0.5) honours this on /v1/chat/completions; older + * builds may fall back to plain JSON mode or ignore it entirely (in which + * case the parser still catches non-conforming output). 
+ */ +export function buildResponseFormat() { + return { + type: 'json_schema', + json_schema: { + name: 'PullRequestReview', + strict: true, + schema: REVIEW_JSON_SCHEMA + } + }; +} + const HEDGE_OR_SLOP_PATTERNS = [ /\bmight\b/i, /\bcould\b/i, diff --git a/src/ai-review.js b/src/ai-review.js index f4ccd0f..9f2d193 100644 --- a/src/ai-review.js +++ b/src/ai-review.js @@ -8,7 +8,8 @@ import { tryParseReview, filterFindings, computeVerdict, - renderReviewMarkdown + renderReviewMarkdown, + buildResponseFormat } from './ai-review-prompt.js'; import { runRivetOracle } from './rivet-oracle.js'; import { hasRivetYaml, withTempRepoCheckout } from './rivet-fetch.js'; @@ -223,25 +224,38 @@ function buildReviewPrompt(prData, diff, files, maxDiffSize) { } async function callLocalAI(endpoint, model, systemPrompt, userPrompt, options = {}) { - const { maxTokens = 2000, temperature = 0.3, timeout = 300000 } = options; + const { + maxTokens = 2000, + temperature = 0.3, + timeout = 300000, + responseFormat = undefined + } = options; const controller = new AbortController(); const timer = setTimeout(() => controller.abort(), timeout); + const payload = { + model, + messages: [ + { role: 'system', content: systemPrompt }, + { role: 'user', content: userPrompt } + ], + max_tokens: maxTokens, + temperature, + stream: false + }; + if (responseFormat) { + // OpenAI-compat: forces grammar-constrained output. Ollama honours + // this on /v1/chat/completions in recent builds. Older endpoints that + // ignore it fall through to the strict-prompt + parser path. 
+ payload.response_format = responseFormat; + } + try { const response = await fetch(endpoint, { method: 'POST', headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ - model, - messages: [ - { role: 'system', content: systemPrompt }, - { role: 'user', content: userPrompt } - ], - max_tokens: maxTokens, - temperature, - stream: false - }), + body: JSON.stringify(payload), signal: controller.signal }); @@ -438,7 +452,10 @@ async function reviewPullRequest(octokit, owner, repo, prNumber) { { maxTokens: aiConfig.max_tokens || 2000, temperature: aiConfig.temperature || 0.3, - timeout: aiConfig.timeout || 120000 + timeout: aiConfig.timeout || 120000, + // Grammar-constrained output in strict mode only — legacy freeform + // path expects prose, would get rejected by JSON-schema decoding. + responseFormat: useStrictMode ? buildResponseFormat() : undefined } );