From 51da648ffb75c8b951f263fe1e9bd0efaa906cbe Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Mon, 27 Apr 2026 06:12:03 +0200 Subject: [PATCH] feat: grammar-constrained AI review output (Ollama JSON schema mode) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Why Today's live test revealed the small model's failure mode: it can produce valid JSON of the *wrong shape*. PR #222 got `{"commit_message":"Add pre-commit configuration files"}` instead of `{verdict, summary, findings}`. Strict parser correctly rejected; bot fell to silence. The strict prompt teaches the model the *intent*. The schema enforces the *shape*. Belt-and-suspenders. ## What New exports in `src/ai-review-prompt.js`: - `REVIEW_JSON_SCHEMA` — full JSON Schema for the review payload, with `verdict` enum, required fields, and `additionalProperties: false`. - `buildResponseFormat()` — wraps the schema in OpenAI-compat `response_format: { type: 'json_schema', json_schema: { name, strict, schema } }`. Ollama (≥0.5) honours this on `/v1/chat/completions` for grammar-constrained generation — the decoder physically cannot emit tokens that violate the schema. `src/ai-review.js`: - `callLocalAI` accepts a new `responseFormat` option. When set, it goes on the wire as `response_format`. Older Ollama versions ignore unknown fields, so the bot degrades gracefully to strict-prompt + parser path. - Strict mode passes `buildResponseFormat()` automatically. Legacy freeform path (when `system_prompt` is overridden) does not — that path expects prose. ## Why this is a bigger deal than it looks The Netcup VPS is RAM-bound: 3.8 GB total, no swap. Anything bigger than qwen2.5-coder:3b at Q4 risks OOM on inference. So we cannot just upgrade the model. Constrained decoding lets the existing 3 B model succeed on the shape problem the prompt couldn't fix on its own. 
## Test plan - [x] All 773 tests pass (was 766 — added 7 covering schema shape, response format builder, wire-payload pass-through, legacy-mode omission) - [x] eslint clean - [ ] After deploy: trigger `/review-pr` on a recent PR. Wire-level: the Ollama request body should contain `response_format` field. Behaviour: no more `"AI review JSON parse failed: invalid verdict: undefined"` log lines (the previous failure was wrong-shape JSON; this kills it). ## Risk & rollout - Risk: low. New field is opt-in via the `responseFormat` argument; absent → wire payload unchanged from current behaviour. Old Ollama: silently ignores; falls through to existing parser. - Rollout: self-update on merge. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> --- __tests__/integration/ai-review.test.js | 19 ++++++++ __tests__/unit/ai-review-prompt.test.js | 36 +++++++++++++++ src/ai-review-prompt.js | 59 +++++++++++++++++++++++++ src/ai-review.js | 43 ++++++++++++------ 4 files changed, 144 insertions(+), 13 deletions(-) diff --git a/__tests__/integration/ai-review.test.js b/__tests__/integration/ai-review.test.js index 47608a7..c792031 100644 --- a/__tests__/integration/ai-review.test.js +++ b/__tests__/integration/ai-review.test.js @@ -629,6 +629,25 @@ describe('ai-review', () => { const callArgs = global.fetch.mock.calls[0]; expect(callArgs[1].signal).toBeInstanceOf(AbortSignal); }); + + it('passes responseFormat through to the wire payload as response_format', async () => { + mockFetchSuccess('{"verdict":"approve","summary":"x","findings":[]}'); + const fmt = { type: 'json_schema', json_schema: { name: 'X', strict: true, schema: { type: 'object' } } }; + + await callLocalAI(mockEndpoint, mockModel, mockSystemPrompt, mockUserPrompt, { + responseFormat: fmt + }); + + const body = JSON.parse(global.fetch.mock.calls[0][1].body); + expect(body.response_format).toEqual(fmt); + }); + + it('omits response_format when not requested 
(legacy compat)', async () => { + mockFetchSuccess('plain prose'); + await callLocalAI(mockEndpoint, mockModel, mockSystemPrompt, mockUserPrompt); + const body = JSON.parse(global.fetch.mock.calls[0][1].body); + expect(body.response_format).toBeUndefined(); + }); }); // ========================================================================= diff --git a/__tests__/unit/ai-review-prompt.test.js b/__tests__/unit/ai-review-prompt.test.js index 4502880..727c338 100644 --- a/__tests__/unit/ai-review-prompt.test.js +++ b/__tests__/unit/ai-review-prompt.test.js @@ -1,5 +1,7 @@ import { STRICT_SYSTEM_PROMPT, + REVIEW_JSON_SCHEMA, + buildResponseFormat, isSlop, tryParseReview, filterFindings, @@ -16,6 +18,40 @@ describe('STRICT_SYSTEM_PROMPT', () => { }); }); +describe('REVIEW_JSON_SCHEMA', () => { + it('declares the verdict enum exactly matching the parser', () => { + expect(REVIEW_JSON_SCHEMA.properties.verdict.enum).toEqual([ + 'approve', 'comment', 'request_changes' + ]); + }); + + it('marks verdict, summary, findings as required', () => { + expect(REVIEW_JSON_SCHEMA.required.sort()).toEqual( + ['findings', 'summary', 'verdict'] + ); + }); + + it('rejects extra properties (additionalProperties: false)', () => { + expect(REVIEW_JSON_SCHEMA.additionalProperties).toBe(false); + expect(REVIEW_JSON_SCHEMA.properties.findings.items.additionalProperties).toBe(false); + }); + + it('every finding field used by the parser is declared in the schema', () => { + const fields = Object.keys(REVIEW_JSON_SCHEMA.properties.findings.items.properties).sort(); + expect(fields).toEqual(['claim', 'file', 'line', 'quoted_line']); + }); +}); + +describe('buildResponseFormat', () => { + it('returns OpenAI-compat json_schema payload', () => { + const rf = buildResponseFormat(); + expect(rf.type).toBe('json_schema'); + expect(rf.json_schema.name).toBe('PullRequestReview'); + expect(rf.json_schema.strict).toBe(true); + expect(rf.json_schema.schema).toBe(REVIEW_JSON_SCHEMA); + }); +}); + 
describe('isSlop', () => { it.each([ 'consider edge cases here', diff --git a/src/ai-review-prompt.js b/src/ai-review-prompt.js index 9963ba5..b5d5807 100644 --- a/src/ai-review-prompt.js +++ b/src/ai-review-prompt.js @@ -50,6 +50,65 @@ Example with a finding: Example with no findings: {"verdict":"approve","summary":"Pure docs change in README.","findings":[]}`; +/** + * JSON Schema describing the strict review payload. Used as a wire-format + * contract: passed to Ollama via OpenAI-compat `response_format` so the + * decoder is grammar-constrained at generation time. The model literally + * cannot emit a token sequence that violates the schema. + * + * Belt-and-suspenders relative to the prompt rules above: the prompt + * teaches the *intent* (banned hedging, quote-or-die etc.) while the schema + * enforces the *shape*. Today's wrong-shape failures (model emitting + * `{"commit_message":"..."}` instead of the review object) become + * impossible. + */ +export const REVIEW_JSON_SCHEMA = { + type: 'object', + additionalProperties: false, + properties: { + verdict: { + type: 'string', + enum: ['approve', 'comment', 'request_changes'] + }, + summary: { + type: 'string', + maxLength: 400 + }, + findings: { + type: 'array', + items: { + type: 'object', + additionalProperties: false, + properties: { + file: { type: 'string', minLength: 1 }, + line: { type: 'integer', minimum: 1 }, + quoted_line: { type: 'string', minLength: 1 }, + claim: { type: 'string', minLength: 5, maxLength: 400 } + }, + required: ['file', 'line', 'quoted_line', 'claim'] + } + } + }, + required: ['verdict', 'summary', 'findings'] +}; + +/** + * Build the OpenAI-compat `response_format` payload for grammar-constrained + * JSON output. Ollama (≥0.5) honours this on /v1/chat/completions; older + * builds may fall back to plain JSON mode or ignore it entirely (in which + * case the parser still catches non-conforming output). 
+ */ +export function buildResponseFormat() { + return { + type: 'json_schema', + json_schema: { + name: 'PullRequestReview', + strict: true, + schema: REVIEW_JSON_SCHEMA + } + }; +} + const HEDGE_OR_SLOP_PATTERNS = [ /\bmight\b/i, /\bcould\b/i, diff --git a/src/ai-review.js b/src/ai-review.js index f4ccd0f..9f2d193 100644 --- a/src/ai-review.js +++ b/src/ai-review.js @@ -8,7 +8,8 @@ import { tryParseReview, filterFindings, computeVerdict, - renderReviewMarkdown + renderReviewMarkdown, + buildResponseFormat } from './ai-review-prompt.js'; import { runRivetOracle } from './rivet-oracle.js'; import { hasRivetYaml, withTempRepoCheckout } from './rivet-fetch.js'; @@ -223,25 +224,38 @@ function buildReviewPrompt(prData, diff, files, maxDiffSize) { } async function callLocalAI(endpoint, model, systemPrompt, userPrompt, options = {}) { - const { maxTokens = 2000, temperature = 0.3, timeout = 300000 } = options; + const { + maxTokens = 2000, + temperature = 0.3, + timeout = 300000, + responseFormat = undefined + } = options; const controller = new AbortController(); const timer = setTimeout(() => controller.abort(), timeout); + const payload = { + model, + messages: [ + { role: 'system', content: systemPrompt }, + { role: 'user', content: userPrompt } + ], + max_tokens: maxTokens, + temperature, + stream: false + }; + if (responseFormat) { + // OpenAI-compat: forces grammar-constrained output. Ollama honours + // this on /v1/chat/completions in recent builds. Older endpoints that + // ignore it fall through to the strict-prompt + parser path. 
+ payload.response_format = responseFormat; + } + try { const response = await fetch(endpoint, { method: 'POST', headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ - model, - messages: [ - { role: 'system', content: systemPrompt }, - { role: 'user', content: userPrompt } - ], - max_tokens: maxTokens, - temperature, - stream: false - }), + body: JSON.stringify(payload), signal: controller.signal }); @@ -438,7 +452,10 @@ async function reviewPullRequest(octokit, owner, repo, prNumber) { { maxTokens: aiConfig.max_tokens || 2000, temperature: aiConfig.temperature || 0.3, - timeout: aiConfig.timeout || 120000 + timeout: aiConfig.timeout || 120000, + // Grammar-constrained output in strict mode only — legacy freeform + // path expects prose, would get rejected by JSON-schema decoding. + responseFormat: useStrictMode ? buildResponseFormat() : undefined } );