From 107fca76d269457e565dad158bceeeb59d9ddeb8 Mon Sep 17 00:00:00 2001
From: Hiten Shah <hnshah@gmail.com>
Date: Sat, 2 May 2026 12:42:38 -0700
Subject: [PATCH 1/2] fix(ce-code-review): keep finding numbers stable

---
 .../skills/ce-code-review/SKILL.md               |  5 +++--
 .../references/review-output-template.md         |  1 +
 tests/review-skill-contract.test.ts              | 16 ++++++++++++++++
 3 files changed, 20 insertions(+), 2 deletions(-)
diff --git a/plugins/compound-engineering/skills/ce-code-review/SKILL.md b/plugins/compound-engineering/skills/ce-code-review/SKILL.md
index 102a458f8..52a2ef577 100644
--- a/plugins/compound-engineering/skills/ce-code-review/SKILL.md
+++ b/plugins/compound-engineering/skills/ce-code-review/SKILL.md
@@ -67,7 +67,7 @@ Sequence:
 - **Skip all user questions.** Never pause for approval or clarification once scope has been established.
 - **Apply only `safe_auto -> review-fixer` findings.** Leave `gated_auto`, `manual`, `human`, and `release` work unresolved.
 - **Write a run artifact** under `/tmp/compound-engineering/ce-code-review/<run-id>/` summarizing findings, applied fixes, residual actionable work, and advisory outputs. Orchestrators read this artifact to route residual `downstream-resolver` findings; the skill itself does not file tickets or prompt the user in autofix.
-- **Emit a compact Residual Actionable Work summary in the autofix return** listing each residual `downstream-resolver` finding with severity, file:line, title, and autofix_class. Include the run-artifact path. Callers read this summary directly without parsing the artifact. When no residuals exist, state `Residual actionable work: none.` explicitly.
+- **Emit a compact Residual Actionable Work summary in the autofix return** listing each residual `downstream-resolver` finding with its stable `#`, severity, file:line, title, and autofix_class. Keep applied `safe_auto` fixes and residual non-auto findings in separate contiguous sections, and reuse the stable finding numbers from Stage 5 rather than renumbering residuals. Include the run-artifact path. Callers read this summary directly without parsing the artifact. When no residuals exist, state `Residual actionable work: none.` explicitly.
 - **Never commit, push, or create a PR** from autofix mode. Parent workflows own those decisions.
 
 ### Report-only mode rules
@@ -551,6 +551,7 @@ Demotion is intentionally narrow. The conservative scope (testing/maintainabilit
    - residual actionable queue: unresolved `gated_auto` or `manual` findings whose owner is `downstream-resolver`
    - report-only queue: `advisory` findings plus anything owned by `human` or `release`
 9. **Sort.** Order by severity (P0 first) -> anchor (descending) -> file path -> line number.
+9b. **Assign stable finding numbers once.** After sorting, assign monotonically increasing `#` values across the full primary finding set. Do not restart numbering inside each severity table or autofix/routing bucket. If later sections repeat a finding (for example Residual Actionable Work after `safe_auto` fixes are applied), reuse the same stable `#` so users can copy the non-auto-fixable work without remapping IDs.
 10. **Collect coverage data.** Union residual_risks and testing_gaps across reviewers.
 11. **Preserve CE agent artifacts.** Keep the learnings, agent-native, schema-drift, and deployment-verification outputs alongside the merged finding set. Do not drop unstructured agent output just because it does not match the persona JSON schema.
 
@@ -600,7 +601,7 @@ When Stage 5b does not run, the merged finding set from Stage 5 flows through to
 Assemble the final report using **pipe-delimited markdown tables for findings** from the review output template included below. The table format is mandatory for finding rows in interactive mode — do not render findings as freeform text blocks or horizontal-rule-separated prose. Other report sections (Applied Fixes, Learnings, Coverage, etc.) use bullet lists and the `---` separator before the verdict, as shown in the template.
 
 1. **Header.** Scope, intent, mode, reviewer team with per-conditional justifications.
-2. **Findings.** Rendered as pipe-delimited tables grouped by severity (`### P0 -- Critical`, `### P1 -- High`, `### P2 -- Moderate`, `### P3 -- Low`). Each finding row shows `#`, file, issue, reviewer(s), confidence, and synthesized route. Omit empty severity levels. Never render findings as freeform text blocks or numbered lists.
+2. **Findings.** Rendered as pipe-delimited tables grouped by severity (`### P0 -- Critical`, `### P1 -- High`, `### P2 -- Moderate`, `### P3 -- Low`). Each finding row shows `#`, file, issue, reviewer(s), confidence, and synthesized route. Omit empty severity levels. Never render findings as freeform text blocks or numbered lists. Finding numbers must come from Stage 5's stable numbering and continue sequentially across severity sections; never restart at `1` for a new severity header.
 3. **Requirements Completeness.** Include only when a plan was found in Stage 2b. For each requirement (R1, R2, etc.) and implementation unit in the plan, report whether corresponding work appears in the diff. Use a simple checklist: met / not addressed / partially addressed. Routing depends on `plan_source`:
    - **`explicit`** (caller-provided or PR body): Flag unaddressed requirements as P1 findings with `autofix_class: manual`, `owner: downstream-resolver`. These enter the residual actionable queue.
    - **`inferred`** (auto-discovered): Flag unaddressed requirements as P3 findings with `autofix_class: advisory`, `owner: human`. These stay in the report only — no autonomous follow-up. An inferred plan match is a hint, not a contract.
diff --git a/plugins/compound-engineering/skills/ce-code-review/references/review-output-template.md b/plugins/compound-engineering/skills/ce-code-review/references/review-output-template.md
index c0d7a8a09..5dac5c0c4 100644
--- a/plugins/compound-engineering/skills/ce-code-review/references/review-output-template.md
+++ b/plugins/compound-engineering/skills/ce-code-review/references/review-output-template.md
@@ -117,6 +117,7 @@ This fails because: no pipe-delimited tables, no severity-grouped `###` headers,
 
 - **Pipe-delimited markdown tables** for findings -- never ASCII box-drawing characters or per-finding horizontal-rule separators between entries (the report-level `---` before the verdict is still required)
 - **Severity-grouped sections** -- `### P0 -- Critical`, `### P1 -- High`, `### P2 -- Moderate`, `### P3 -- Low`. Omit empty severity levels.
+- **Stable sequential finding numbers** -- assign finding numbers once after sorting, continue them across severity sections, and reuse those same numbers when findings are repeated in Residual Actionable Work. Do not restart at `1` for each severity or route bucket.
 - **Always include file:line location** for code review issues
 - **Reviewer column** shows which persona(s) flagged the issue. Multiple reviewers = cross-reviewer agreement.
 - **Confidence column** shows the finding's anchor as an integer (`50`, `75`, or `100`). Never render as a float.
diff --git a/tests/review-skill-contract.test.ts b/tests/review-skill-contract.test.ts
index 5a720d0a1..4f098b80e 100644
--- a/tests/review-skill-contract.test.ts
+++ b/tests/review-skill-contract.test.ts
@@ -731,8 +731,24 @@ describe("ce-code-review contract", () => {
   test("ce-code-review autofix emits a residual-work summary in-chat, not only in the artifact", async () => {
     const content = await readRepoFile("plugins/compound-engineering/skills/ce-code-review/SKILL.md")
     expect(content).toMatch(/Emit a compact Residual Actionable Work summary/)
+    expect(content).toContain("with its stable `#`, severity, file:line, title, and autofix_class")
+    expect(content).toContain("Keep applied `safe_auto` fixes and residual non-auto findings in separate contiguous sections")
     expect(content).toContain("Residual actionable work: none.")
   })
+
+  test("ce-code-review uses stable sequential finding numbers across grouped output", async () => {
+    const content = await readRepoFile("plugins/compound-engineering/skills/ce-code-review/SKILL.md")
+    const template = await readRepoFile(
+      "plugins/compound-engineering/skills/ce-code-review/references/review-output-template.md",
+    )
+
+    expect(content).toContain("Assign stable finding numbers once")
+    expect(content).toContain("Do not restart numbering inside each severity table or autofix/routing bucket")
+    expect(content).toContain("reuse the same stable `#`")
+    expect(content).toContain("never restart at `1` for a new severity header")
+    expect(template).toContain("Stable sequential finding numbers")
+    expect(template).toContain("reuse those same numbers when findings are repeated in Residual Actionable Work")
+  })
 })
 
 describe("testing-reviewer contract", () => {

From b6e0251d75e1862786f9068300b7112426282704 Mon Sep 17 00:00:00 2001
From: Hiten Shah <hnshah@gmail.com>
Date: Sun, 3 May 2026 08:49:08 -0700
Subject: [PATCH 2/2] fix: address stable finding number review feedback

---
 .../skills/ce-code-review/SKILL.md            |  7 ++--
 .../references/review-output-template.md      |  2 +-
 .../ce-code-review-stable-numbering.md        | 31 ++++++++++++++++++
 tests/review-skill-contract.test.ts           | 32 ++++++++++++++++---
 4 files changed, 62 insertions(+), 10 deletions(-)
 create mode 100644 tests/fixtures/ce-code-review-stable-numbering.md

diff --git a/plugins/compound-engineering/skills/ce-code-review/SKILL.md b/plugins/compound-engineering/skills/ce-code-review/SKILL.md
index 52a2ef577..97edcc9fe 100644
--- a/plugins/compound-engineering/skills/ce-code-review/SKILL.md
+++ b/plugins/compound-engineering/skills/ce-code-review/SKILL.md
@@ -67,7 +67,7 @@ Sequence:
 - **Skip all user questions.** Never pause for approval or clarification once scope has been established.
 - **Apply only `safe_auto -> review-fixer` findings.** Leave `gated_auto`, `manual`, `human`, and `release` work unresolved.
 - **Write a run artifact** under `/tmp/compound-engineering/ce-code-review/<run-id>/` summarizing findings, applied fixes, residual actionable work, and advisory outputs. Orchestrators read this artifact to route residual `downstream-resolver` findings; the skill itself does not file tickets or prompt the user in autofix.
-- **Emit a compact Residual Actionable Work summary in the autofix return** listing each residual `downstream-resolver` finding with its stable `#`, severity, file:line, title, and autofix_class. Keep applied `safe_auto` fixes and residual non-auto findings in separate contiguous sections, and reuse the stable finding numbers from Stage 5 rather than renumbering residuals. Include the run-artifact path. Callers read this summary directly without parsing the artifact. When no residuals exist, state `Residual actionable work: none.` explicitly.
+- **Emit a compact Residual Actionable Work summary in the autofix return** listing each residual `downstream-resolver` finding with its stable `#`, severity, file:line, title, and autofix_class. Structure the summary as two separate contiguous sections: applied `safe_auto` fixes first, then residual non-auto findings. Within the residual section, reuse each finding's stable `#` from Stage 5 -- never renumber. Include the run-artifact path. Callers read this summary directly without parsing the artifact. When no residuals exist, state `Residual actionable work: none.` explicitly.
 - **Never commit, push, or create a PR** from autofix mode. Parent workflows own those decisions.
 
 ### Report-only mode rules
@@ -550,8 +550,7 @@ Demotion is intentionally narrow. The conservative scope (testing/maintainabilit
    - in-skill fixer queue: only `safe_auto -> review-fixer`
    - residual actionable queue: unresolved `gated_auto` or `manual` findings whose owner is `downstream-resolver`
    - report-only queue: `advisory` findings plus anything owned by `human` or `release`
-9. **Sort.** Order by severity (P0 first) -> anchor (descending) -> file path -> line number.
-9b. **Assign stable finding numbers once.** After sorting, assign monotonically increasing `#` values across the full primary finding set. Do not restart numbering inside each severity table or autofix/routing bucket. If later sections repeat a finding (for example Residual Actionable Work after `safe_auto` fixes are applied), reuse the same stable `#` so users can copy the non-auto-fixable work without remapping IDs.
+9. **Sort and number.** Order by severity (P0 first) -> anchor (descending) -> file path -> line number, then assign monotonically increasing `#` values across the full primary finding set in that sorted order. Do not restart numbering inside each severity table or autofix/routing bucket. If later sections repeat a finding (for example Residual Actionable Work after `safe_auto` fixes are applied), reuse the same stable `#` so users -- and downstream skills like `ce-resolve-pr-feedback` -- can reference findings by `#` after the autofix loop rewrites the report. Renumbering after autofix invalidates any prior reference: copied snippets, follow-up prompts citing `#3`, or tickets filed against an earlier render.
 10. **Collect coverage data.** Union residual_risks and testing_gaps across reviewers.
 11. **Preserve CE agent artifacts.** Keep the learnings, agent-native, schema-drift, and deployment-verification outputs alongside the merged finding set. Do not drop unstructured agent output just because it does not match the persona JSON schema.
 
@@ -601,7 +600,7 @@ When Stage 5b does not run, the merged finding set from Stage 5 flows through to
 Assemble the final report using **pipe-delimited markdown tables for findings** from the review output template included below. The table format is mandatory for finding rows in interactive mode — do not render findings as freeform text blocks or horizontal-rule-separated prose. Other report sections (Applied Fixes, Learnings, Coverage, etc.) use bullet lists and the `---` separator before the verdict, as shown in the template.
 
 1. **Header.** Scope, intent, mode, reviewer team with per-conditional justifications.
-2. **Findings.** Rendered as pipe-delimited tables grouped by severity (`### P0 -- Critical`, `### P1 -- High`, `### P2 -- Moderate`, `### P3 -- Low`). Each finding row shows `#`, file, issue, reviewer(s), confidence, and synthesized route. Omit empty severity levels. Never render findings as freeform text blocks or numbered lists. Finding numbers must come from Stage 5's stable numbering and continue sequentially across severity sections; never restart at `1` for a new severity header.
+2. **Findings.** Rendered as pipe-delimited tables grouped by severity (`### P0 -- Critical`, `### P1 -- High`, `### P2 -- Moderate`, `### P3 -- Low`). Each finding row shows `#`, file, issue, reviewer(s), confidence, and synthesized route. Omit empty severity levels. Never render findings as freeform text blocks or numbered lists. Finding numbers come from the stable assignment in Stage 5 -- never re-derive them per severity table.
 3. **Requirements Completeness.** Include only when a plan was found in Stage 2b. For each requirement (R1, R2, etc.) and implementation unit in the plan, report whether corresponding work appears in the diff. Use a simple checklist: met / not addressed / partially addressed. Routing depends on `plan_source`:
    - **`explicit`** (caller-provided or PR body): Flag unaddressed requirements as P1 findings with `autofix_class: manual`, `owner: downstream-resolver`. These enter the residual actionable queue.
    - **`inferred`** (auto-discovered): Flag unaddressed requirements as P3 findings with `autofix_class: advisory`, `owner: human`. These stay in the report only — no autonomous follow-up. An inferred plan match is a hint, not a contract.
diff --git a/plugins/compound-engineering/skills/ce-code-review/references/review-output-template.md b/plugins/compound-engineering/skills/ce-code-review/references/review-output-template.md
index 5dac5c0c4..7ea6dd652 100644
--- a/plugins/compound-engineering/skills/ce-code-review/references/review-output-template.md
+++ b/plugins/compound-engineering/skills/ce-code-review/references/review-output-template.md
@@ -51,7 +51,7 @@ Use this **exact format** when presenting synthesized review findings. Findings
 | # | File | Issue | Route | Next Step |
 |---|------|-------|-------|-----------|
 | 1 | `orders_controller.rb:42` | Ownership check missing on export lookup | `gated_auto -> downstream-resolver` | Defer via tracker (requires explicit approval before behavior change) |
-| 2 | `export_service.rb:91` | Pagination contract needs a broader API decision | `manual -> downstream-resolver` | Defer via tracker with contract and client impact details |
+| 3 | `export_service.rb:91` | Pagination contract needs a broader API decision | `manual -> downstream-resolver` | Defer via tracker with contract and client impact details |
 
 ### Pre-existing Issues
 
diff --git a/tests/fixtures/ce-code-review-stable-numbering.md b/tests/fixtures/ce-code-review-stable-numbering.md
new file mode 100644
index 000000000..710f8b59a
--- /dev/null
+++ b/tests/fixtures/ce-code-review-stable-numbering.md
@@ -0,0 +1,31 @@
+## Code Review Results
+
+**Scope:** merge-base with main -> working tree
+**Intent:** Demonstrate stable finding numbering
+**Mode:** autofix
+
+**Reviewers:** correctness, testing, maintainability
+
+### P1 -- High
+
+| # | File | Issue | Reviewer | Confidence | Route |
+|---|------|-------|----------|------------|-------|
+| 1 | `export_service.rb:87` | Loads all orders into memory | performance | 100 | `safe_auto -> review-fixer` |
+| 2 | `export_service.rb:91` | Missing pagination contract | api-contract | 75 | `manual -> downstream-resolver` |
+
+### P2 -- Moderate
+
+| # | File | Issue | Reviewer | Confidence | Route |
+|---|------|-------|----------|------------|-------|
+| 3 | `export_service.rb:45` | Missing error handling | correctness | 75 | `gated_auto -> downstream-resolver` |
+
+### Applied Fixes
+
+- `safe_auto`: Applied bounded export loading fix for #1.
+
+### Residual Actionable Work
+
+| # | File | Issue | Route | Next Step |
+|---|------|-------|-------|-----------|
+| 2 | `export_service.rb:91` | Missing pagination contract | `manual -> downstream-resolver` | Defer via tracker with API contract context |
+| 3 | `export_service.rb:45` | Missing error handling | `gated_auto -> downstream-resolver` | Defer via tracker pending behavior approval |
diff --git a/tests/review-skill-contract.test.ts b/tests/review-skill-contract.test.ts
index 4f098b80e..a338ff4bd 100644
--- a/tests/review-skill-contract.test.ts
+++ b/tests/review-skill-contract.test.ts
@@ -732,7 +732,9 @@ describe("ce-code-review contract", () => {
     const content = await readRepoFile("plugins/compound-engineering/skills/ce-code-review/SKILL.md")
     expect(content).toMatch(/Emit a compact Residual Actionable Work summary/)
     expect(content).toContain("with its stable `#`, severity, file:line, title, and autofix_class")
-    expect(content).toContain("Keep applied `safe_auto` fixes and residual non-auto findings in separate contiguous sections")
+    expect(content).toContain("Structure the summary as two separate contiguous sections")
+    expect(content).toContain("applied `safe_auto` fixes first, then residual non-auto findings")
+    expect(content).toContain("reuse each finding's stable `#` from Stage 5 -- never renumber")
     expect(content).toContain("Residual actionable work: none.")
   })
 
@@ -741,13 +743,33 @@ describe("ce-code-review contract", () => {
     const template = await readRepoFile(
       "plugins/compound-engineering/skills/ce-code-review/references/review-output-template.md",
     )
+    const fixture = await readRepoFile("tests/fixtures/ce-code-review-stable-numbering.md")
 
-    expect(content).toContain("Assign stable finding numbers once")
-    expect(content).toContain("Do not restart numbering inside each severity table or autofix/routing bucket")
-    expect(content).toContain("reuse the same stable `#`")
-    expect(content).toContain("never restart at `1` for a new severity header")
+    const stage5 = content.split("### Stage 5b:")[0].split("### Stage 5:")[1]
+    expect(stage5).toMatch(/Sort and number/)
+    expect(stage5).toMatch(/Do not restart numbering inside each severity table or autofix\/routing bucket/)
+    expect(stage5).toMatch(/reuse the same stable `#`/)
+    expect(stage5).toMatch(/ce-resolve-pr-feedback/)
+
+    const stage6 = content.split("### Headless output format")[0].split("### Stage 6: Synthesize and present")[1]
+    expect(stage6).toContain("Finding numbers come from the stable assignment in Stage 5")
+    expect(stage6).toContain("never re-derive them per severity table")
     expect(template).toContain("Stable sequential finding numbers")
     expect(template).toContain("reuse those same numbers when findings are repeated in Residual Actionable Work")
+
+    const primaryFindingIds = Array.from(
+      fixture.matchAll(/^\| (\d+) \| `[^`]+` \| .* \| .* \| \d+ \| `.*` \|$/gm),
+      ([, id]) => Number(id),
+    )
+    expect(primaryFindingIds).toEqual([1, 2, 3])
+
+    const residualSection = fixture.split("### Residual Actionable Work")[1]
+    const residualIds = Array.from(
+      residualSection.matchAll(/^\| (\d+) \| `[^`]+` \| .* \| `.*` \| .* \|$/gm),
+      ([, id]) => Number(id),
+    )
+    expect(residualIds).toEqual([2, 3])
+    expect(residualIds.every((id) => primaryFindingIds.includes(id))).toBe(true)
   })
 })