diff --git a/CHANGELOG.md b/CHANGELOG.md index 8709c5c0..50810fd6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,7 +21,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed -- **`verify` auto-publishes by default** — successful runs upload to My Verifications as `private`; opt-out with `--no-publish`; pass `--vis unlisted` or `--vis public` to widen visibility. (#425) +- **`verify` auto-publishes by default** — successful runs upload to My Verifications as `private`; opt-out with `--local-only`; pass `--vis unlisted` or `--vis public` to widen visibility. (#425) - **Per-attachment assets in `verify` output** — `verify-response.json` now includes an `attachments` map (pageImages, originalDownload) alongside `verifications` when any attachment returned assets. (#430) - **CDN blink animation synced with React** — CDN now imports `BLINK_*` constants from the same source as the React component, fixing a 180 ms vs 120 ms enter-duration discrepancy; implements the full three-stage enter animation with a `cancelBlink()` guard. (#427) - **Billing copy** — quota-exceeded messages and the dashboard description now reflect subscription tiers (Standard 20/week at $20/mo, Pro unlimited at $200/mo). (#427) diff --git a/src/__tests__/citationParser.test.ts b/src/__tests__/citationParser.test.ts index 5a4a3676..3f283de9 100644 --- a/src/__tests__/citationParser.test.ts +++ b/src/__tests__/citationParser.test.ts @@ -117,6 +117,30 @@ ${CITATION_DATA_END_DELIMITER}`; expect(result.citations[0].source_context).toContain("Line one"); }); + it("repairs literal newlines inside JSON string values", () => { + const response = `Multi-line content [1]. + +${CITATION_DATA_START_DELIMITER} +[ + { + "id": 1, + "attachment_id": "doc789", + "source_context": "Line one +Line two +Line three", + "source_match": "Line two", + "page_id": "page_number_1_index_0" + } +] +${CITATION_DATA_END_DELIMITER}`; + + const result = parseCitationData(response); + + expect(result.success).toBe(true); + expect(result.citations.length).toBe(1); + expect(result.citations[0].source_context).toContain("Line one"); + }); + it("handles multiple citations in single sentence", () => { const response = `Revenue was $1B [1] with profit of $100M [2] in Q4 [3]. @@ -162,6 +186,19 @@ ${CITATION_DATA_START_DELIMITER} expect(result.citations.length).toBe(1); }); + it("handles malformed end delimiter variant", () => { + const response = `Test [1]. + +${CITATION_DATA_START_DELIMITER} +[{"id": 1, "attachment_id": "a", "source_context": "test", "source_match": "test"}] +<<>>`; + + const result = parseCitationData(response); + + expect(result.success).toBe(true); + expect(result.citations.length).toBe(1); + }); + it("handles empty citation block", () => { const response = `No citations here. @@ -175,19 +212,13 @@ ${CITATION_DATA_END_DELIMITER}`; expect(result.citations.length).toBe(0); }); - it("fails when citation block is present but whitespace-only", () => { - // A bare <<>> block with no content is an authoring mistake - // (e.g. unfilled template placeholder), not a legitimate empty result. - // An empty array `[]` is fine; whitespace-only is a failure. - // See plans/noble-skipping-wolf.md for the parallel-agent merge failure this caught. + it("treats whitespace-only citation blocks as recoverable empties", () => { const response = `Body text.\n\n${CITATION_DATA_START_DELIMITER}\n\n${CITATION_DATA_END_DELIMITER}\n`; const result = parseCitationData(response); - expect(result.success).toBe(false); - expect(result.error).toMatch(/empty/i); + expect(result.success).toBe(true); expect(result.citations.length).toBe(0); - // visibleText is still extracted even on failure expect(result.visibleText).toBe("Body text."); }); diff --git a/src/__tests__/cite.test.ts b/src/__tests__/cite.test.ts index 707a7de3..80906f4a 100644 --- a/src/__tests__/cite.test.ts +++ b/src/__tests__/cite.test.ts @@ -39,6 +39,14 @@ describe("extractMarkersFromBody", () => { expect(markers[0].claimText).toBe("Discount Rate"); }); + it("preserves alternate labels for a reused cite id", () => { + const body = "[Horizontal Boundaries](cite:1) and [Vertical Boundaries](cite:1)"; + const markers = extractMarkersFromBody(body); + expect(markers).toHaveLength(1); + expect(markers[0].claimText).toBe("Horizontal Boundaries"); + expect(markers[0].claimTextVariants).toEqual(["Vertical Boundaries"]); + }); + it("extracts double-quoted anchor hint", () => { const body = '[terminates](cite:3 "automatically terminate")'; const markers = extractMarkersFromBody(body); @@ -108,6 +116,14 @@ describe("extractMarkersFromBody", () => { expect(markers).toHaveLength(1); expect(markers[0].claimText).toBe("Rate"); }); + + it("preserves alternate labels for reused bold markers", () => { + const body = "**Horizontal Boundaries** [1] and **Vertical Boundaries** [1]"; + const markers = extractMarkersFromBody(body); + expect(markers).toHaveLength(1); + expect(markers[0].claimText).toBe("Horizontal Boundaries"); + expect(markers[0].claimTextVariants).toEqual(["Vertical Boundaries"]); + }); }); // ── getAllLines ─────────────────────────────────────────────────── diff --git a/src/__tests__/cliCommands.test.ts b/src/__tests__/cliCommands.test.ts index a3eae9cb..395b5cbc 100644 --- a/src/__tests__/cliCommands.test.ts +++ b/src/__tests__/cliCommands.test.ts @@ -7,7 +7,7 @@ */ import { randomBytes } from "node:crypto"; -import { mkdirSync, readFileSync, writeFileSync } from "node:fs"; +import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs"; import { tmpdir } from "node:os"; import { join, resolve } from "node:path"; @@ -567,7 +567,7 @@ describe("prepare command", () => { ); }); - it("prepares a URL", async () => { + it("prints prepared URL JSON to stdout by default", async () => { const tmpDir = makeTmpDir(); const origCwd = process.cwd(); process.chdir(tmpDir); @@ -575,14 +575,20 @@ describe("prepare command", () => { try { mockPrepareUrl.mockResolvedValue({ attachmentId: "att-123", - deepTextPages: ["some text"], + deepTextPages: [`some text`], metadata: { pageCount: 1, textByteSize: 1024 }, }); const { stdout } = await captureOutput(() => prepare(["https://example.com/article"], fmtNetErr)); expect(mockPrepareUrl).toHaveBeenCalledWith(expect.objectContaining({ url: "https://example.com/article" })); - expect(stdout).toContain(".deepcitation/prepare-example.com.json"); + const output = JSON.parse(stdout); + expect(output).toEqual({ + attachmentId: "att-123", + metadata: { pageCount: 1, textByteSize: 1024 }, + deepTextPages: [`some text`], + }); + expect(existsSync(join(tmpDir, ".deepcitation"))).toBe(false); } finally { process.chdir(origCwd); } @@ -608,7 +614,7 @@ describe("prepare command", () => { } }); - it("prints text JSON when --text is used", async () => { + it("writes prepared JSON only when --out is provided", async () => { const tmpDir = makeTmpDir(); const origCwd = process.cwd(); process.chdir(tmpDir); @@ -616,21 +622,28 @@ describe("prepare command", () => { try { mockPrepareUrl.mockResolvedValue({ attachmentId: "att-123", - deepTextPages: ["deep text here"], + deepTextPages: [`deep text here`], metadata: { pageCount: 2, textByteSize: 2048 }, }); - const { stdout } = await captureOutput(() => prepare(["https://example.com/article", "--text"], fmtNetErr)); + const outPath = join(tmpDir, ".deepcitation", "prepare-example.json"); + const { stdout } = await captureOutput(() => + prepare(["https://example.com/article", "--out", outPath], fmtNetErr), + ); - const summary = JSON.parse(stdout); - expect(summary.attachmentId).toBe("att-123"); - expect(summary.deepTextPages).toEqual(["deep text here"]); + expect(stdout).toBe(outPath); + const output = JSON.parse(readFileSync(outPath, "utf-8")); + expect(output.attachmentId).toBe("att-123"); + expect(output.metadata).toEqual({ pageCount: 2, textByteSize: 2048 }); + expect(output.deepTextPages).toEqual([ + `deep text here`, + ]); } finally { process.chdir(origCwd); } }); - it("--text strips and metadata from deepTextPages", async () => { + it("prints tagged text when --format txt is used", async () => { const tmpDir = makeTmpDir(); const origCwd = process.cwd(); process.chdir(tmpDir); @@ -645,16 +658,123 @@ describe("prepare command", () => { metadata: { pageCount: 2, textByteSize: 512 }, }); - const { stdout } = await captureOutput(() => prepare(["https://example.com/doc", "--text"], fmtNetErr)); + const { stdout } = await captureOutput(() => prepare(["https://example.com/doc", "--format", "txt"], fmtNetErr)); + + expect(stdout).toContain(""); + expect(stdout).toContain('Hello world.'); + expect(stdout).toContain(""); + expect(stdout).toContain(""); + expect(stdout).toContain('Page two content.'); + expect(stdout).toContain(""); + } finally { + process.chdir(origCwd); + } + }); + + it("prints cleaned prose when --format plain is used", async () => { + const tmpDir = makeTmpDir(); + const origCwd = process.cwd(); + process.chdir(tmpDir); + + try { + mockPrepareUrl.mockResolvedValue({ + attachmentId: "att-tagged", + deepTextPages: [ + `Hello world.Second line.`, + `Page two content.`, + ], + metadata: { pageCount: 2, textByteSize: 512 }, + }); + + const { stdout } = await captureOutput(() => + prepare(["https://example.com/doc", "--format", "plain"], fmtNetErr), + ); + + expect(stdout).not.toContain(" { + const tmpDir = makeTmpDir(); + const origCwd = process.cwd(); + process.chdir(tmpDir); + + try { + mockPrepareUrl.mockResolvedValue({ + attachmentId: "att-tagged", + deepTextPages: [`Hello world.`], + metadata: { pageCount: 1, textByteSize: 128 }, + }); + + const outPath = join(tmpDir, "evidence.txt"); + const { stdout } = await captureOutput(() => + prepare(["https://example.com/doc", "--format", "txt", "--out", outPath], fmtNetErr), + ); - const summary = JSON.parse(stdout); - expect(summary.deepTextPages[0]).not.toContain("Hello world.`, + ); + } finally { + process.chdir(origCwd); + } + }); + + it("preserves page wrappers when --pages selects a subset", async () => { + const tmpDir = makeTmpDir(); + const origCwd = process.cwd(); + process.chdir(tmpDir); + + try { + mockPrepareUrl.mockResolvedValue({ + attachmentId: "att-pages", + deepTextPages: [ + `Page one.`, + `Page two.`, + ], + metadata: { pageCount: 2, textByteSize: 256 }, + }); + + const { stdout } = await captureOutput(() => prepare(["https://example.com/doc", "--pages", "2"], fmtNetErr)); + const output = JSON.parse(stdout); + + expect(output.deepTextPages).toEqual([ + `Page two.`, + ]); + } finally { + process.chdir(origCwd); + } + }); + + it.each(["--text", "--txt", "--summary"])("rejects removed prepare flag %s", async removedFlag => { + await expect(captureOutput(() => prepare(["https://example.com/doc", removedFlag], fmtNetErr))).rejects.toThrow( + "process.exit(1)", + ); + }); + + it("accepts --out before the source argument", async () => { + const tmpDir = makeTmpDir(); + const origCwd = process.cwd(); + process.chdir(tmpDir); + + try { + mockPrepareUrl.mockResolvedValue({ + attachmentId: "att-ordered", + deepTextPages: [`Ordered.`], + metadata: { pageCount: 1, textByteSize: 64 }, + }); + + const outPath = join(tmpDir, "ordered.json"); + await captureOutput(() => prepare(["--out", outPath, "https://example.com/doc"], fmtNetErr)); + + expect(mockPrepareUrl).toHaveBeenCalledWith(expect.objectContaining({ url: "https://example.com/doc" })); + expect(JSON.parse(readFileSync(outPath, "utf-8")).attachmentId).toBe("att-ordered"); } finally { process.chdir(origCwd); } @@ -717,7 +837,9 @@ describe("prepare command", () => { const { stdout } = await captureOutput(() => prepare([filePath], fmtNetErr)); expect(mockUploadFile).toHaveBeenCalled(); - expect(stdout).toContain("prepare-report.json"); + const output = JSON.parse(stdout); + expect(output.attachmentId).toBe("att-456"); + expect(output.deepTextPages).toEqual(["text"]); } finally { process.chdir(origCwd); } @@ -753,6 +875,10 @@ describe("verify command (--citations mode)", () => { await expect(captureOutput(() => verify(["--citations", citPath], fmtNetErr))).rejects.toThrow("process.exit(1)"); }); + it("rejects deprecated --no-publish and directs callers to --local-only", async () => { + await expect(captureOutput(() => verify(["--no-publish"], fmtNetErr))).rejects.toThrow("process.exit(1)"); + }); + it("verifies citations grouped by attachmentId", async () => { const tmpDir = makeTmpDir(); const origCwd = process.cwd(); @@ -781,12 +907,116 @@ describe("verify command (--citations mode)", () => { const { stderr } = await captureOutput(() => verify(["--citations", citPath], fmtNetErr)); expect(mockVerifyAttachment).toHaveBeenCalledTimes(2); + for (const call of mockVerifyAttachment.mock.calls) { + expect(call[2]).toMatchObject({ requestTimeoutMs: 10_000 }); + } expect(stderr).toContain("3 citations across 2 attachment(s)"); } finally { process.chdir(origCwd); } }); + it("does not auto-promote a display label when it has no overlap with the anchor", async () => { + const tmpDir = makeTmpDir(); + const origCwd = process.cwd(); + process.chdir(tmpDir); + mkdirSync(join(tmpDir, ".deepcitation"), { recursive: true }); + + const mdPath = join(tmpDir, "report.md"); + const outPath = join(tmpDir, "verified.html"); + writeFileSync( + mdPath, + [ + "# Report", + "", + "[**Horizontal Boundaries**](cite:1)", + "", + "<<>>", + JSON.stringify([ + { + id: 1, + attachment_id: "att-1", + source_context: "context", + source_match: "SCHEDULE “C”", + line_ids: [1], + page_id: "1_1", + }, + ]), + "<<>>", + "", + ].join("\n"), + ); + + mockVerifyAttachment.mockResolvedValueOnce({ + verifications: { key1: { status: "not_found" } }, + }); + + try { + const { stderr } = await captureOutput(() => verify(["--markdown", mdPath, "--out", outPath], fmtNetErr)); + expect(mockVerifyAttachment).toHaveBeenCalledTimes(1); + const citationsArg = mockVerifyAttachment.mock.calls[0]?.[1] as Record>; + const firstCitation = Object.values(citationsArg)[0]; + expect(firstCitation).toBeDefined(); + expect(firstCitation).toMatchObject({ + sourceMatch: "SCHEDULE “C”", + }); + expect(stderr).not.toContain("auto-promoted display label to anchor"); + } finally { + process.chdir(origCwd); + } + }); + + it("does not auto-promote a long display label even when it shares a token", async () => { + const tmpDir = makeTmpDir(); + const origCwd = process.cwd(); + process.chdir(tmpDir); + mkdirSync(join(tmpDir, ".deepcitation"), { recursive: true }); + + const mdPath = join(tmpDir, "report.md"); + const outPath = join(tmpDir, "verified.html"); + const longLabel = "The company's revenue grew significantly in Q4 while margins improved"; + writeFileSync( + mdPath, + [ + "# Report", + "", + `[**${longLabel}**](cite:1)`, + "", + "<<>>", + JSON.stringify([ + { + id: 1, + attachment_id: "att-1", + source_context: "context", + source_match: "revenue", + line_ids: [1], + page_id: "1_1", + }, + ]), + "<<>>", + "", + ].join("\n"), + ); + + mockVerifyAttachment.mockResolvedValueOnce({ + verifications: { key1: { status: "not_found" } }, + }); + + try { + const { stderr } = await captureOutput(() => verify(["--markdown", mdPath, "--out", outPath], fmtNetErr)); + expect(mockVerifyAttachment).toHaveBeenCalledTimes(1); + const citationsArg = mockVerifyAttachment.mock.calls[0]?.[1] as Record>; + const firstCitation = Object.values(citationsArg)[0]; + expect(firstCitation).toBeDefined(); + expect(firstCitation).toMatchObject({ + sourceMatch: "revenue", + }); + expect(stderr).not.toContain("auto-promoted display label to anchor"); + } finally { + process.chdir(origCwd); + } + }); + it("writes attachments to output file when verifyAttachment returns assets", async () => { const tmpDir = makeTmpDir(); const origCwd = process.cwd(); diff --git a/src/__tests__/cliText.test.ts b/src/__tests__/cliText.test.ts index 84013054..d61f0e18 100644 --- a/src/__tests__/cliText.test.ts +++ b/src/__tests__/cliText.test.ts @@ -118,7 +118,7 @@ describe("text", () => { expect(code).toBe(1); }); - it("rejects --format json (use prepare --text)", () => { + it("rejects --format json (use prepare default JSON output)", () => { const prep = writePrepareFile(tmp, 1); const code = runAndCatchExit([prep, "-f", "json"]); expect(code).toBe(1); diff --git a/src/__tests__/client.test.ts b/src/__tests__/client.test.ts index f600c2af..981cba54 100644 --- a/src/__tests__/client.test.ts +++ b/src/__tests__/client.test.ts @@ -538,6 +538,65 @@ describe("DeepCitation Client", () => { // Different citations should make separate calls expect(mockFetch).toHaveBeenCalledTimes(2); }); + + it("does not reuse a failed request across different timeouts", async () => { + jest.useFakeTimers(); + try { + const client = new DeepCitation({ apiKey: "sk-dc-test-key-00000001" }); + + mockFetch.mockImplementation((_, init) => { + return new Promise((resolve, reject) => { + const signal = init?.signal as AbortSignal | undefined; + const timer = setTimeout(() => { + resolve({ + ok: true, + json: async () => ({ + verifications: { "1": { status: "found" } }, + }), + } as Response); + }, 10); + + const abort = () => { + clearTimeout(timer); + reject(signal?.reason ?? new DOMException("Aborted", "AbortError")); + }; + + if (signal?.aborted) { + abort(); + return; + } + + signal?.addEventListener("abort", abort, { once: true }); + }); + }); + + const citations = { + "1": { + pageNumber: 1, + sourceContext: "test phrase", + attachmentId: "file_abc", + }, + }; + + const shortTimeout = client.verifyAttachment("file_abc", citations, { requestTimeoutMs: 1 }); + jest.advanceTimersByTime(1); + await Promise.resolve(); + await expect(shortTimeout).rejects.toThrow("Request timed out after 1ms"); + + const longTimeout = client.verifyAttachment("file_abc", citations, { requestTimeoutMs: 100 }); + jest.advanceTimersByTime(10); + await Promise.resolve(); + await expect(longTimeout).resolves.toMatchObject({ + verifications: { + "1": { status: "found" }, + }, + }); + + expect(mockFetch).toHaveBeenCalledTimes(2); + } finally { + jest.useRealTimers(); + } + }); }); describe("verifyBatch", () => { @@ -1307,5 +1366,27 @@ describe("DeepCitation Client", () => { // Only one fetch attempt — aborted during the delay before the second attempt expect(mockFetch).toHaveBeenCalledTimes(1); }); + + it("does not retry when fetch rejects with AbortError", async () => { + const client = new DeepCitation({ apiKey: "sk-dc-test-key-00000001", maxRetries: 3 }); + + mockFetch.mockRejectedValueOnce(new DOMException("Aborted", "AbortError")); + + const blob = new Blob(["content"]); + await expect(client.uploadFile(blob, { filename: "test.pdf" })).rejects.toThrow("Aborted"); + + expect(mockFetch).toHaveBeenCalledTimes(1); + }); + + it("does not retry when fetch rejects with TimeoutError", async () => { + const client = new DeepCitation({ apiKey: "sk-dc-test-key-00000001", maxRetries: 3 }); + + mockFetch.mockRejectedValueOnce(new DOMException("Request timed out after 50ms", "TimeoutError")); + + const blob = new Blob(["content"]); + await expect(client.uploadFile(blob, { filename: "test.pdf" })).rejects.toThrow("timed out"); + + expect(mockFetch).toHaveBeenCalledTimes(1); + }); }); }); diff --git a/src/__tests__/markdownToHtml.test.ts b/src/__tests__/markdownToHtml.test.ts index a92fde6f..0b993cc0 100644 --- a/src/__tests__/markdownToHtml.test.ts +++ b/src/__tests__/markdownToHtml.test.ts @@ -37,6 +37,13 @@ describe("wrapCitationMarkers", () => { expect(result).toContain('href="https://example.com"'); }); + it("preserves tags with > inside quoted attributes", () => { + const html = '

Claim [1]

'; + const result = wrapCitationMarkers(html); + expect(result).toContain(''); + expect(result).toContain('Claim'); + }); + it("anchors to the last clause when text contains punctuation", () => { const html = "

Overall, revenue grew significantly [1]

"; const result = wrapCitationMarkers(html); @@ -44,12 +51,10 @@ describe("wrapCitationMarkers", () => { expect(result).toContain("revenue grew significantly"); }); it("emits empty span for punctuation-only anchors", () => { - // Schedule "C" produces an anchor of just `"` after the regex cuts at the quote const html = '

Schedule "C" [1]

'; const result = wrapCitationMarkers(html); expect(result).toContain('data-cite="1"'); - // The span should have no inner text content (empty anchor) - expect(result).toMatch(/<\/span>/); + expect(result).toContain('Schedule "C"'); }); }); @@ -110,6 +115,21 @@ describe("markdownToHtml block parsing", () => { expect(result).toContain("

"); }); + it("renders headings with CRLF line endings", () => { + const result = markdownToHtml("# Title\r\n\r\n## Section\r\n\r\n### Sub", { style: "plain" }); + expect(result).toContain("

"); + expect(result).toContain("

"); + expect(result).toContain("

"); + }); + + it("renders headings with up to three leading spaces", () => { + const result = markdownToHtml(" # Title\n\n ## Section\n\n ### Sub", { style: "plain" }); + expect(result).toContain("

"); + expect(result).toContain("

"); + expect(result).toContain("

"); + expect(result).not.toContain("

### Sub

"); + }); + it("renders paragraphs", () => { const result = markdownToHtml("Hello world.\n\nSecond paragraph.", { style: "plain" }); expect(result).toContain("

Hello world.

"); @@ -195,7 +215,6 @@ describe("markdownToHtml style shells", () => { const mdWithSections = "# Report\n\n## Key Findings\n\nImportant stuff.\n\n## Details\n\nMore details."; const result = markdownToHtml(mdWithSections, { style: "report" }); expect(result).toContain(""); - expect(result).toContain("dc-verdict"); expect(result).toContain("data-dc-drawer-trigger"); }); @@ -325,6 +344,14 @@ describe("markdownToHtml report body (flat rendering)", () => { const sectionPos = result.indexOf("Content."); expect(preamblePos).toBeLessThan(sectionPos); }); + + it("does not render left-gutter section numbers in the report shell", () => { + const result = markdownToHtml("# Report\n\n## One\n\n### Two", { style: "report" }); + expect(result).not.toContain('content: "00"'); + expect(result).not.toContain("counter-reset: h2section"); + expect(result).not.toContain("h2::before"); + expect(result).not.toContain("h3::before"); + }); }); // ── markdownToHtml — citation marker integration ────────────────── @@ -487,20 +514,17 @@ describe("markdownToHtml header — claim & model", () => { it("renders a claim card when claim is provided", () => { const result = markdownToHtml("# T\nbody", { claim: "Did revenue exceed $4B?" }); expect(result).toContain('class="dc-claim"'); - expect(result).toContain(">CLAIM<"); expect(result).toContain("Did revenue exceed $4B?"); }); it("omits the claim card when claim is absent", () => { const result = markdownToHtml("# T\nbody", {}); expect(result).not.toContain('
CLAIM<"); }); it("suppresses a whitespace-only claim", () => { const result = markdownToHtml("# T\nbody", { claim: " " }); expect(result).not.toContain('
CLAIM<"); }); it("escapes HTML in the claim", () => { diff --git a/src/__tests__/parseCitation.test.ts b/src/__tests__/parseCitation.test.ts index bb08907b..74bd595f 100644 --- a/src/__tests__/parseCitation.test.ts +++ b/src/__tests__/parseCitation.test.ts @@ -1292,6 +1292,18 @@ describe("getAllCitationsFromLlmOutput", () => { }); describe("deferred JSON <<>> format", () => { + it("treats an explicit empty block as a parse error", () => { + const input = `Text before. + +<<>> + +<<>>`; + + const result = getAllCitationsFromLlmOutput(input); + + expect(result).toEqual({}); + }); + it("extracts citations from exact user failing scenario with 14 citations", () => { // This is the EXACT failing scenario from the user const input = `Here's a summary of the medical document for John Doe: @@ -1467,6 +1479,23 @@ Patient Profile: expect(citation.lineIds).toEqual([5, 6]); }); + it("keeps CRLF line breaks intact when repairing JSON strings", () => { + const input = `Line one [1]. + +<<>> +[ + {"id": 1, "source_context": "Line 1\r\nLine 2", "source_match": "Line 1", "page_id": "1_0", "line_ids": [1]} +] +<<>>`; + + const result = getAllCitationsFromLlmOutput(input); + + expect(Object.keys(result).length).toBe(1); + const citation = Object.values(result)[0]; + expect(citation.sourceContext).toBe("Line 1\nLine 2"); + expect(citation.sourceContext).not.toContain("\n\n"); + }); + it("extracts citations from deferred JSON format with camelCase keys", () => { const input = `Camel case [1]. diff --git a/src/cli/cite.ts b/src/cli/cite.ts index 309e3fe6..4c8c5af2 100644 --- a/src/cli/cite.ts +++ b/src/cli/cite.ts @@ -81,6 +81,8 @@ export function getAllLines(lineMap: LineMap): LineEntry[] { export interface BodyMarker { id: number; claimText: string; + /** Alternate labels that reused the same citation ID. */ + claimTextVariants?: string[]; /** Verbatim anchor text from the evidence, if provided via title syntax. */ anchorHint?: string; } @@ -102,26 +104,26 @@ export interface BodyMarker { export function extractMarkersFromBody(body: string): BodyMarker[] { // Match [label](cite:N ...) — capture everything inside the parens after cite:N const re = /\[([^\][]+)\]\(cite:(\d+)((?:\s+[^)]*)?)\)/g; - const seen = new Map(); // id → first display label - const results: BodyMarker[] = []; + const seen = new Map(); // id → first marker, with alternates preserved let m: RegExpExecArray | null; while ((m = safeExec(re, body)) !== null) { const label = m[1].trim(); const id = parseInt(m[2], 10); const rest = m[3]?.trim() ?? ""; - if (seen.has(id)) { - if (seen.get(id) !== label) { + const existing = seen.get(id); + if (existing) { + if (existing.claimText !== label && !(existing.claimTextVariants?.includes(label) ?? false)) { console.error( ` Warning: cite:${id} reused with different label — ` + - `"${sanitizeForLog(seen.get(id) ?? "")}" (used) vs "${sanitizeForLog(label)}" (ignored). ` + + `"${sanitizeForLog(existing.claimText)}" (used) vs "${sanitizeForLog(label)}" (stored as variant). ` + `Each distinct claim must use a unique ID.`, ); + existing.claimTextVariants ??= []; + existing.claimTextVariants.push(label); } continue; } - seen.set(id, label); - const marker: BodyMarker = { id, claimText: label }; // Parse optional anchor hint (single or double quoted) @@ -130,32 +132,34 @@ export function extractMarkersFromBody(body: string): BodyMarker[] { const anchorRaw = anchorDQ?.[1] ?? anchorSQ?.[1]; if (anchorRaw?.trim()) marker.anchorHint = anchorRaw.trim(); - results.push(marker); + seen.set(id, marker); } // Fallback: **bold text** [N] markers (Strategy 2c format). // Only used when no [text](cite:N) markers were found. - if (results.length === 0) { + if (seen.size === 0) { const boldRe = /\*\*([^*]+)\*\*\s*\[(\d+)\]/g; let bm: RegExpExecArray | null; while ((bm = safeExec(boldRe, body)) !== null) { const label = bm[1].trim(); const id = parseInt(bm[2], 10); - if (seen.has(id)) { - if (seen.get(id) !== label) { + const existing = seen.get(id); + if (existing) { + if (existing.claimText !== label && !(existing.claimTextVariants?.includes(label) ?? false)) { console.error( ` Warning: [${id}] reused with different label — ` + - `"${sanitizeForLog(seen.get(id) ?? "")}" (used) vs "${sanitizeForLog(label)}" (ignored). ` + + `"${sanitizeForLog(existing.claimText)}" (used) vs "${sanitizeForLog(label)}" (stored as variant). ` + `Each distinct claim must use a unique ID.`, ); + existing.claimTextVariants ??= []; + existing.claimTextVariants.push(label); } continue; } - seen.set(id, label); - results.push({ id, claimText: label }); + seen.set(id, { id, claimText: label }); } } - return results.sort((a, b) => a.id - b.id); + return [...seen.values()].sort((a, b) => a.id - b.id); } /** Generic words skipped by Strategy 3 to avoid wrong-context single-word matches. */ diff --git a/src/cli/commands.ts b/src/cli/commands.ts index 52b42182..1f91207a 100644 --- a/src/cli/commands.ts +++ b/src/cli/commands.ts @@ -44,7 +44,6 @@ import { sanitizeForLog } from "../utils/logSafety.js"; import { normalizeCitationsFile } from "../utils/normalizeCitations.js"; import { detectProxyUrl } from "../utils/proxy.js"; import { safeExec, safeReplace, safeTest } from "../utils/regexSafety.js"; -import { cleanDeepTextPage } from "../utils/textCleanup.js"; import { validateCitationData } from "../utils/validateCitationData.js"; import { CDN_JS } from "../vanilla/_generated_cdn.js"; import { @@ -60,7 +59,6 @@ import { die, extractApiKey, isValidApiKeyFormat, normalizeShortFlags, parseArgs import { findSummaryForMarkdown, hydrateCitations, parseSummaryToLineMap } from "./hydrate.js"; import { generateReviewVariants, markdownToHtml, type ReportStyle } from "./markdownToHtml.js"; import { createCoworkFetch, createProxyFetch } from "./proxy.js"; -import type { TextFormat } from "./textRender.js"; import { applyLineIds, parseFormatMode, parseLineIdsMode, renderTextStream, resolvePageSpec } from "./textRender.js"; // Re-export so cli.ts and tests can import from the single commands module @@ -118,23 +116,17 @@ Examples: export const PREPARE_HELP = `Usage: deepcitation prepare [options] Prepare a file or URL for citation verification. Uploads the source to the -DeepCitation API and saves the response JSON (attachmentId + deepTextPages). +DeepCitation API and prints the prepared output to stdout by default. Arguments: Local file path or URL to prepare Options: - --out, -o Output path (default: .deepcitation/prepare-{name}.json - for JSON mode, .deepcitation/{name}.txt for --txt mode) - --text Print cleaned {attachmentId, deepTextPages} JSON to stdout. - Backward-compatible default: strips / - tags unless --line-ids is also passed. - --txt Write tagged text to .deepcitation/{name}.txt (LLM default). - Equivalent to --format txt with default line-id sampling. + --out, -o Write output to file instead of stdout --format, -f Output format override: "json" | "txt" | "plain" - - json: {attachmentId, deepTextPages} JSON + - json: {attachmentId, metadata, deepTextPages} JSON - txt: raw deepTextPages with and - tags (what citation authoring wants) + tags - plain: page text with all tags stripped, pages joined by "\\n\\n" --line-ids, -l Line-ID tag sampling: "default" | "none" | "every=N" - default: every-5 + first/last (server default) @@ -150,11 +142,11 @@ Options: Examples: deepcitation prepare report.pdf - deepcitation prepare report.pdf --txt # LLM default: tagged text to .txt file - deepcitation prepare report.pdf --txt -p 1-10 # only first 10 pages - deepcitation prepare report.pdf --text # back-compat: cleaned JSON to stdout - deepcitation prepare report.pdf --text -f txt # stdout: tagged text instead of JSON - deepcitation prepare https://example.com/article --txt + deepcitation prepare report.pdf --out .deepcitation/prepare-report.json + deepcitation prepare report.pdf --format txt # prompt-ready tagged text to stdout + deepcitation prepare report.pdf --format txt -p 1-10 # only first 10 pages + deepcitation prepare report.pdf --format plain + deepcitation prepare https://example.com/article --format txt `; export const VERIFY_HELP = `Usage: deepcitation verify [options] @@ -186,7 +178,7 @@ Options: --out Output path (default: {stem}-verified.html in CWD) --output-dir Save HTML and verify-response.json to this directory with stable names --json, --keep-json Also write {stem}-verify-response.json next to the HTML (debug/publish) - --no-publish Skip the auto-upload to My Verifications. Default is to publish as private. + --local-only Skip the auto-upload to My Verifications. --vis, --visibility Published visibility: private | unlisted | public (default: private) --theme Popover color theme (default: "auto") --indicator Indicator variant: icon, dot, none (default: "icon") @@ -195,12 +187,12 @@ Options: -h, --help Show this help message Examples: - deepcitation verify --md .deepcitation/draft-report.md # auto-publishes as private - deepcitation verify --md report.md --claim "Did Q1 revenue exceed $4B?" --model "Claude Haiku 4.5" + deepcitation verify --md .deepcitation/draft-report.md + deepcitation verify --md report.md --claim "Did Q1 revenue exceed $4B?" deepcitation verify --md report.md --style plain deepcitation verify --md report.md --vis unlisted # shareable by link deepcitation verify --md report.md --vis public # (Portal session only) - deepcitation verify --md report.md --no-publish # local-only, don't upload + deepcitation verify --md report.md --local-only deepcitation verify --html report.html --out verified.html deepcitation verify --prompt deepcitation verify --citations .deepcitation/citations-keyed.json @@ -249,10 +241,62 @@ Examples: const ALLOWED_THEMES = ["auto", "light", "dark"] as const; const ALLOWED_INDICATORS = ["icon", "dot", "none"] as const; +const VERIFY_REQUEST_TIMEOUT_MS = 10_000; +const MAX_AUTO_PROMOTE_LABEL_LENGTH = 60; + +function hasMeaningfulLabelOverlap(left: string, right: string): boolean { + const tokenRe = /[a-z0-9]+/g; + const leftTokens = new Set((left.toLowerCase().match(tokenRe) ?? []).filter(token => token.length >= 3)); + const rightTokens = [...(right.toLowerCase().match(tokenRe) ?? [])].filter(token => token.length >= 3); + if (leftTokens.size === 0 || rightTokens.length === 0) return false; + if (right.trim().length > MAX_AUTO_PROMOTE_LABEL_LENGTH) return false; + + let overlapCount = 0; + for (const token of rightTokens) { + if (leftTokens.has(token)) { + overlapCount++; + if (overlapCount >= 1) { + return true; + } + } + } + + return false; +} // ── helpers ─────────────────────────────────────────────────────── const DEFAULT_API_URL = "https://api.deepcitation.com"; +const PREPARE_VALUE_FLAGS = new Set(["--out", "--format", "--line-ids", "--pages"]); + +function findPrepareSource(argv: string[]): string | undefined { + for (let i = 0; i < argv.length; i++) { + const arg = argv[i]; + if (!arg) continue; + if (PREPARE_VALUE_FLAGS.has(arg)) { + i++; + continue; + } + if (!arg.startsWith("-")) return arg; + } + return undefined; +} + +function prepareMigrationError(flag: string): never { + die( + `${flag} is no longer supported by prepare. ` + + `Use the default JSON output, --format txt, --format plain, and/or --out .`, + PREPARE_HELP, + ); +} + +function writePrepareFile(outPath: string, body: string): void { + const parent = dirname(outPath); + if (!existsSync(parent)) mkdirSync(parent, { recursive: true }); + writeFileSync(outPath, body); // lgtm[js/http-to-file-access] + console.error(` Saved: ${outPath}`); + console.log(outPath); +} export function canStartBrowserAuth(argv: string[] = []): boolean { // --browser is an explicit opt-in that starts the OAuth flow even in constrained @@ -438,26 +482,22 @@ export async function prepare(argv: string[], _fmtNetErr: (err: unknown) => stri // Extract boolean flags before parseArgs (which only handles --key value pairs) const unsafeFast = normalized.includes("--unsafe-fast"); - const textFlag = normalized.includes("--text") || normalized.includes("--summary"); - const txtFlag = normalized.includes("--txt"); const skipCache = normalized.includes("--skip-cache"); - const booleans = new Set(["--unsafe-fast", "--text", "--summary", "--skip-cache", "--txt"]); + for (const removedFlag of ["--text", "--txt", "--summary"]) { + if (normalized.includes(removedFlag)) prepareMigrationError(removedFlag); + } + const booleans = new Set(["--unsafe-fast", "--skip-cache"]); const filteredArgv = normalized.filter(a => !booleans.has(a)); const args = parseArgs(filteredArgv, PREPARE_HELP); - // Positional argument: first non-flag token. - const positional = filteredArgv.find(a => !a.startsWith("--")); + // Positional argument: first non-flag token that is not a flag value. + const positional = findPrepareSource(filteredArgv); if (!positional) die("A file path or URL is required", PREPARE_HELP); // Validate format + line-id flags up-front so bad input fails before the API call. const lineIdsMode = parseLineIdsMode(args["line-ids"], PREPARE_HELP); - const fallbackFormat: TextFormat = txtFlag ? "txt" : "json"; - const format = parseFormatMode(args.format, fallbackFormat, PREPARE_HELP); - - if (txtFlag && format === "json") { - die("--txt conflicts with --format json; drop --txt or pass --format txt/plain", PREPARE_HELP); - } + const format = parseFormatMode(args.format, "json", PREPARE_HELP); const { apiKey } = await requireAuth(); const dc = await createClient(apiKey); @@ -468,16 +508,13 @@ export async function prepare(argv: string[], _fmtNetErr: (err: unknown) => stri } let result; - let label: string; if (isUrl) { - label = new URL(positional).hostname.replace(/^www\./, ""); console.error(unsafeFast ? `Preparing URL (fast mode)...` : `Preparing URL (this may take ~30s)...`); result = await dc.prepareUrl({ url: positional, unsafeFastUrlOutput: unsafeFast, skipCache }); } else { const filePath = resolve(positional); if (!existsSync(filePath)) die(`File not found: ${positional}`, PREPARE_HELP); - label = basename(filePath).replace(/\.[^.]+$/, ""); console.error(`Preparing file: ${basename(filePath)}...`); const buffer = readFileSync(filePath); result = await dc.uploadFile(buffer, { filename: basename(filePath) }); @@ -486,61 +523,35 @@ export async function prepare(argv: string[], _fmtNetErr: (err: unknown) => stri const pickedIndices = resolvePageSpec(args.pages, result.deepTextPages.length, PREPARE_HELP); const selectedPages = pickedIndices.map(i => result.deepTextPages[i] as string); - const outDir = resolve(".deepcitation"); - if (!existsSync(outDir)) mkdirSync(outDir, { recursive: true }); - - // --txt mode: write tagged text to a .txt file (LLM default). - if (txtFlag) { - const txtPath = resolve(args.out ?? `.deepcitation/${label}.txt`); - const body = renderTextStream(selectedPages, format === "json" ? "txt" : format, lineIdsMode); - writeFileSync(txtPath, body); // lgtm[js/http-to-file-access] - console.error(` Attachment ID: ${sanitizeForLog(result.attachmentId)}`); - console.error( - ` Pages: ${pickedIndices.length}${pickedIndices.length !== result.metadata.pageCount ? ` / ${result.metadata.pageCount}` : ""}`, - ); - console.error(` Text: ${Math.round(result.metadata.textByteSize / 1024)}KB`); - if (result.processingTimeMs) { - console.error(` Time: ${(result.processingTimeMs / 1000).toFixed(1)}s`); - } - console.error(` Saved: ${txtPath}`); - console.log(txtPath); - return; - } - - // Default path: write the full prepare response as JSON to disk. - const outPath = resolve(args.out ?? `.deepcitation/prepare-${label}.json`); - writeFileSync(outPath, JSON.stringify(result, null, 2)); // lgtm[js/http-to-file-access] + const body = + format === "json" + ? JSON.stringify( + { + attachmentId: result.attachmentId, + metadata: result.metadata, + deepTextPages: selectedPages.map(page => applyLineIds(page, lineIdsMode)), + }, + null, + 2, + ) + : renderTextStream(selectedPages, format, lineIdsMode); console.error(` Attachment ID: ${sanitizeForLog(result.attachmentId)}`); - console.error(` Pages: ${result.metadata.pageCount}`); + console.error( + ` Pages: ${pickedIndices.length}${pickedIndices.length !== result.metadata.pageCount ? ` / ${result.metadata.pageCount}` : ""}`, + ); console.error(` Text: ${Math.round(result.metadata.textByteSize / 1024)}KB`); if (result.processingTimeMs) { console.error(` Time: ${(result.processingTimeMs / 1000).toFixed(1)}s`); } - console.error(` Saved: ${outPath}`); - if (textFlag) { - if (format === "txt" || format === "plain") { - // Stream tagged or plain text to stdout instead of JSON. - process.stdout.write(renderTextStream(selectedPages, format, lineIdsMode)); - process.stdout.write("\n"); - return; - } - // json format — back-compat path. When no --line-ids flag is passed, strip tags - // (current behavior). When --line-ids is explicit, honor it. - const pagesForJson = - args["line-ids"] === undefined - ? selectedPages.map(cleanDeepTextPage) - : selectedPages.map(p => applyLineIds(p, lineIdsMode)); - console.log( - JSON.stringify({ - attachmentId: result.attachmentId, - deepTextPages: pagesForJson, - }), - ); - } else { - console.log(outPath); + if (args.out) { + writePrepareFile(resolve(args.out), body); + return; } + + process.stdout.write(body); + if (!body.endsWith("\n")) process.stdout.write("\n"); } export async function verify( @@ -549,6 +560,9 @@ export async function verify( resolveSpecPath?: () => string | null, ) { argv = normalizeShortFlags(argv); + if (argv.includes("--no-publish")) { + die("--no-publish is no longer supported. Use --local-only to skip auto-upload to My Verifications.", VERIFY_HELP); + } // Handle --prompt before parseArgs (it's a boolean flag, not a key-value pair) if (argv.includes("--prompt")) { if (resolveSpecPath) { @@ -631,7 +645,7 @@ export async function verify( // Cast: CLI reads citations from JSON files as Record>, // but verifyAttachment expects its own typed CitationMap. The shapes match at runtime. groupCitations as unknown as Parameters[1], - { outputImageFormat: imageFormat }, + { outputImageFormat: imageFormat, requestTimeoutMs: VERIFY_REQUEST_TIMEOUT_MS }, ); Object.assign(merged, result.verifications); // Preserve per-attachment assets (pageImages, originalDownload) so downstream @@ -883,9 +897,19 @@ export async function verifyMarkdown(argv: string[], fmtNetErr: (err: unknown) = const allLines = getAllLines(lineMap); const citations: CitationData[] = []; - for (const { id, claimText, anchorHint } of markers) { - const searchTerm = anchorHint ?? claimText; - const found = findAnchorWithFallback(searchTerm, allLines); + for (const { id, claimText, claimTextVariants, anchorHint } of markers) { + const searchTerms = anchorHint + ? [anchorHint, claimText, ...(claimTextVariants ?? [])] + : [claimText, ...(claimTextVariants ?? [])]; + let found: ReturnType | null = null; + let usedSearchTerm: string | undefined; + for (const searchTerm of searchTerms) { + found = findAnchorWithFallback(searchTerm, allLines); + if (found) { + usedSearchTerm = searchTerm; + break; + } + } if (!found) { console.error(` Citation ${id} ("${claimText}"): not found in evidence`); continue; @@ -898,7 +922,8 @@ export async function verifyMarkdown(argv: string[], fmtNetErr: (err: unknown) = page_id: toCompactPageId(pageId), line_ids: [lineId], attachment_id: attachmentId, - claim_text: claimText.toLowerCase() !== sourceMatch.toLowerCase() ? claimText : undefined, + claim_text: + usedSearchTerm && usedSearchTerm.toLowerCase() !== sourceMatch.toLowerCase() ? usedSearchTerm : undefined, }); } @@ -1095,7 +1120,7 @@ export async function verifyHtml(argv: string[], _fmtNetErr: (err: unknown) => s // Boolean flags — filter out before parseArgs (which only handles --key value pairs). // --publish / --pub are no-op opt-ins kept for backwards-compat: auto-publish - // is now the default and only needs to be suppressed with --no-publish. + // is now the default and only needs to be suppressed with --local-only. const keepJson = normalized.includes("--json") || normalized.includes("--keep-json"); const booleanFlags = new Set(["--json", "--keep-json"]); const filteredArgv = normalized.filter(a => !booleanFlags.has(a)); @@ -1116,14 +1141,14 @@ export async function verifyHtml(argv: string[], _fmtNetErr: (err: unknown) => s die(`No valid <<>> block found in the ${src} file.`, VERIFY_HELP); } - // 1b. When the model picked a short bold display label that differs from - // source_match, promote the bold text to anchor — it's what the reader - // clicks and should drive the highlight. Mutates `parsed.citations` - // before the verify API call. + // 1b. When the model picked a short bold display label that still overlaps + // the existing source_match, promote the bold text to anchor — it's what + // the reader clicks and should drive the highlight. Mutates + // `parsed.citations` before the verify API call. { + const labelsById = new Map>(); const spanRe = /<([a-zA-Z][a-zA-Z0-9]*)\s+[^>]*data-cite="(\d+)"[^>]*>([\s\S]*?)<\/\1>/g; let m: RegExpExecArray | null; - let promoted = 0; while ((m = safeExec(spanRe, parsed.visibleText)) !== null) { const id = parseInt(m[2], 10); // Strip nested tags in one pass. data-cite spans wrap at most a single @@ -1139,13 +1164,24 @@ export async function verifyHtml(argv: string[], _fmtNetErr: (err: unknown) => s visible = visible.replace(/\s+/g, " ").trim(); if (!visible) continue; - const wordCount = visible.split(/\s+/).length; - if (wordCount > 4 || visible.length > 40) continue; + let labels = labelsById.get(id); + if (!labels) { + labels = new Set(); + labelsById.set(id, labels); + } + labels.add(visible); + } + let promoted = 0; + for (const [id, labels] of labelsById.entries()) { + if (labels.size !== 1) continue; + const [visible] = [...labels]; const cd = parsed.citations.find(c => c.id === id); if (!cd) continue; + const currentAnchor = (cd.source_match ?? "").trim(); - if (currentAnchor && currentAnchor.toLowerCase() === visible.toLowerCase()) continue; + if (!currentAnchor || currentAnchor.toLowerCase() === visible.toLowerCase()) continue; + if (!hasMeaningfulLabelOverlap(currentAnchor, visible)) continue; console.error( ` [${id}] auto-promoted display label to anchor: "${visible}" (was "${currentAnchor.slice(0, 40)}${currentAnchor.length > 40 ? "…" : ""}")`, @@ -1283,7 +1319,7 @@ export async function verifyHtml(argv: string[], _fmtNetErr: (err: unknown) => s attachmentId, // Cast: same as verify command — JSON-parsed citations → typed CitationMap groupCitations as unknown as Parameters[1], - { outputImageFormat: imageFormat }, + { outputImageFormat: imageFormat, requestTimeoutMs: VERIFY_REQUEST_TIMEOUT_MS }, ); Object.assign(merged, result.verifications); // Invariant: each attachmentId belongs to exactly one group, so result.attachments diff --git a/src/cli/hydrate.ts b/src/cli/hydrate.ts index 4e20a6da..a359a07d 100644 --- a/src/cli/hydrate.ts +++ b/src/cli/hydrate.ts @@ -32,13 +32,13 @@ Use this when the draft was generated with the compact citation format Options: --markdown Path to draft markdown file with <<>> block - --summary Path to summary file from "deepcitation prepare --text" + --summary Path to JSON summary file from "deepcitation prepare --out" --out Output path (default: overwrites --markdown input) -h, --help Show this help message Examples: - deepcitation hydrate --markdown .deepcitation/draft.md --summary .deepcitation/summary-report.txt - deepcitation hydrate --markdown .deepcitation/draft.md --summary .deepcitation/summary-report.txt --out .deepcitation/draft-hydrated.md + deepcitation hydrate --markdown .deepcitation/draft.md --summary .deepcitation/prepare-report.json + deepcitation hydrate --markdown .deepcitation/draft.md --summary .deepcitation/prepare-report.json --out .deepcitation/draft-hydrated.md `; /** @@ -63,7 +63,7 @@ export interface LineMap { } export interface HydrateOptions { - /** Raw content of the summary file (JSON string from deepcitation prepare --text) */ + /** Raw content of the summary file (JSON string from deepcitation prepare --out) */ summaryContent: string; /** Citations to hydrate in place — source_context is mutated on matching entries */ citations: CitationData[]; @@ -399,7 +399,7 @@ export function hydrateCitations({ summaryContent, citations, warnOnMiss }: Hydr * * Search order (most reliable first): * 1. `.deepcitation/prepare-*.json` — pure JSON output from `deepcitation prepare` - * 2. `.deepcitation/summary-*.txt` — text+JSON output from `prepare --text` + * 2. `.deepcitation/summary-*.txt` — legacy text+JSON output from `prepare --text` * * When `attachmentId` is provided, scans each candidate and returns the first one * whose JSON contains a matching `attachmentId`. This prevents the wrong evidence diff --git a/src/cli/lint.ts b/src/cli/lint.ts index 6f0c992a..44a68185 100644 --- a/src/cli/lint.ts +++ b/src/cli/lint.ts @@ -10,7 +10,7 @@ import { existsSync, readFileSync } from "node:fs"; import { resolve } from "node:path"; -import { parseCitationData } from "../parsing/citationParser.js"; +import { hasWhitespaceOnlyCitationBlock, parseCitationData } from "../parsing/citationParser.js"; import type { CitationData } from "../prompts/citationPrompts.js"; import { CITATION_DATA_START_DELIMITER } from "../prompts/citationPrompts.js"; import { sanitizeForLog } from "../utils/logSafety.js"; @@ -135,6 +135,15 @@ function runChecks(content: string): Finding[] { return findings; } + if (hasWhitespaceOnlyCitationBlock(content)) { + findings.push({ + severity: "ERR", + rule: "parse", + message: "empty <<>> block", + }); + return findings; + } + const parsed = parseCitationData(content); if (!parsed.success) { findings.push({ diff --git a/src/cli/markdownToHtml.ts b/src/cli/markdownToHtml.ts index 96aa409d..80596a7f 100644 --- a/src/cli/markdownToHtml.ts +++ b/src/cli/markdownToHtml.ts @@ -115,28 +115,25 @@ export interface CitationSourceMatchMap { [citationId: string]: string; } -/** - * Find [N] markers in HTML content and wrap the appropriate text fragment - * in a . The CDN runtime needs data-cite on inline - * elements for indicator placement. - * - * When `sourceMatchMap` is provided, the sourceMatch for each citation is used as - * the clickable display label. The function searches backward in the text - * before [N] for the sourceMatch (case-insensitive) and wraps only that - * occurrence. This produces short, scannable inline citations that match - * the evidence highlight. - * - * Without `sourceMatchMap`, falls back to wrapping the last clause before [N]. - */ -export function wrapCitationMarkers(html: string, sourceMatchMap?: CitationSourceMatchMap): string { - // Match [N] markers anywhere in text nodes. Excluding `<` and `>` keeps us from - // consuming HTML tag boundaries; excluding `"` keeps us out of quoted attribute values. - return html.replace(/([^<>"]*?)\s*\[(\d+)\]/g, (_match, textBefore: string, num: string) => { +function wrapCitationMarkerTextSegment(text: string, sourceMatchMap?: CitationSourceMatchMap): string { + let out = ""; + let cursor = 0; + const markerRe = /\[(\d+)\]/g; + let match: RegExpExecArray | null; + + while ((match = markerRe.exec(text)) !== null) { + const markerStart = match.index; + const markerEnd = markerStart + match[0].length; + const textBefore = text.slice(cursor, markerStart); + const num = match[1]; const trimmed = textBefore.trimEnd(); - if (!trimmed) return ``; - // ── Strategy 1: Use sourceMatch from citation data ───────────── - // Find the sourceMatch within the preceding text and wrap only that phrase. + if (!trimmed) { + out += textBefore + ``; + cursor = markerEnd; + continue; + } + const sourceMatch = sourceMatchMap?.[num]; if (sourceMatch) { const idx = trimmed.toLowerCase().lastIndexOf(sourceMatch.toLowerCase()); @@ -144,20 +141,18 @@ export function wrapCitationMarkers(html: string, sourceMatchMap?: CitationSourc const before = trimmed.slice(0, idx); const matched = trimmed.slice(idx, idx + sourceMatch.length); const after = trimmed.slice(idx + sourceMatch.length); - return `${before}${matched}${after}`; + out += before + `${matched}` + after; + cursor = markerEnd; + continue; } - // sourceMatch not found in text — fall through to heuristic } - // ── Strategy 2: Heuristic — last clause before [N] ─────────── const clauseMatch = trimmed.match(/(?:[,;–—]\s*)([^,;–—]+)$/); const anchor = clauseMatch ? clauseMatch[1].trim() : trimmed; - - // If the anchor is only punctuation (e.g. the [^<"] regex cut off at a - // literal quote in text content like Schedule "C".), emit an empty span - // so the CDN shows a superscript indicator instead of wrapping garbage. if (!/[a-zA-Z0-9]/.test(anchor)) { - return `${trimmed}`; + out += `${textBefore}`; + cursor = markerEnd; + continue; } const prefix = clauseMatch @@ -165,8 +160,35 @@ export function wrapCitationMarkers(html: string, sourceMatchMap?: CitationSourc clauseMatch[0].slice(0, clauseMatch[0].length - anchor.length) : ""; - return `${prefix}${anchor}`; - }); + out += `${prefix}${anchor}`; + cursor = markerEnd; + } + + return out + text.slice(cursor); +} + +/** + * Find [N] markers in HTML content and wrap the appropriate text fragment + * in a . The CDN runtime needs data-cite on inline + * elements for indicator placement. + * + * When `sourceMatchMap` is provided, the sourceMatch for each citation is used as + * the clickable display label. The function searches backward in the text + * before [N] for the sourceMatch (case-insensitive) and wraps only that + * occurrence. This produces short, scannable inline citations that match + * the evidence highlight. + * + * Without `sourceMatchMap`, falls back to wrapping the last clause before [N]. + */ +export function wrapCitationMarkers(html: string, sourceMatchMap?: CitationSourceMatchMap): string { + const segments = splitHtmlPreservingTags(html); + return segments + .map(segment => { + return segment.startsWith("<") && segment.endsWith(">") + ? segment + : wrapCitationMarkerTextSegment(segment, sourceMatchMap); + }) + .join(""); } // ── Block-level parsing ──────────────────────────────────────────── @@ -179,8 +201,67 @@ interface Block { language?: string; // for code blocks } +function splitHtmlPreservingTags(html: string): string[] { + const segments: string[] = []; + let buffer = ""; + + for (let i = 0; i < html.length; i++) { + const ch = html[i] as string; + if (ch !== "<") { + buffer += ch; + continue; + } + + if (buffer) { + segments.push(buffer); + buffer = ""; + } + + const start = i; + let inSingleQuote = false; + let inDoubleQuote = false; + let closed = false; + + for (i = i + 1; i < html.length; i++) { + const tagChar = html[i] as string; + if (tagChar === "'" && !inDoubleQuote) { + inSingleQuote = !inSingleQuote; + } else if (tagChar === '"' && !inSingleQuote) { + inDoubleQuote = !inDoubleQuote; + } else if (tagChar === ">" && !inSingleQuote && !inDoubleQuote) { + segments.push(html.slice(start, i + 1)); + closed = true; + break; + } + } + + if (!closed) { + buffer += html.slice(start); + break; + } + } + + if (buffer) { + segments.push(buffer); + } + + return segments; +} + +function parseAtxHeading(line: string): { level: number; content: string } | null { + const match = line.match(/^ {0,3}(#{1,6})\s+(.+)$/); + if (!match) { + return null; + } + + return { + level: match[1].length, + content: match[2].trimEnd(), + }; +} + function parseBlocks(markdown: string): Block[] { - const lines = markdown.split("\n"); + const lines = markdown.replace(/\r\n?/g, "\n").split("\n"); const blocks: Block[] = []; let i = 0; @@ -212,12 +293,12 @@ function parseBlocks(markdown: string): Block[] { } // Heading - const headingMatch = line.match(/^(#{1,6})\s+(.+)$/); - if (headingMatch) { + const heading = parseAtxHeading(line); + if (heading) { blocks.push({ type: "heading", - level: headingMatch[1].length, - content: headingMatch[2], + level: heading.level, + content: heading.content, }); i++; continue; @@ -275,7 +356,7 @@ function parseBlocks(markdown: string): Block[] { while ( i < lines.length && lines[i].trim() !== "" && - !/^#{1,6}\s/.test(lines[i]) && + !parseAtxHeading(lines[i]) && !/^(-{3,}|\*{3,}|_{3,})\s*$/.test(lines[i]) && !lines[i].trim().startsWith("```") && !/^\s*[-*+]\s+/.test(lines[i]) && @@ -530,7 +611,6 @@ function reportShell(title: string, bodyHtml: string, options: MarkdownToHtmlOpt const claimText = options.claim?.trim(); const claimCard = claimText ? `
-CLAIM
${inlineFormat(claimText)}
` : ""; @@ -554,21 +634,9 @@ ${REVIEW_SHARED_BASE_CSS} -webkit-font-smoothing: antialiased; max-width: 900px; margin: 0 auto; - padding: 3rem 1.5rem 4rem 6.5rem; - counter-reset: h2section; - } - body > header { margin-bottom: 2rem; position: relative; } - body > header::before { - content: "00"; - position: absolute; - left: -5rem; - top: 0.4rem; - font-family: var(--dc-font-family-mono); - font-size: 12px; - font-weight: 500; - color: var(--dc-border); - letter-spacing: 0.05em; + padding: 3rem 1.5rem 4rem; } + body > header { margin-bottom: 2rem; } body > header h1 { font-size: 30px; font-weight: 600; @@ -593,58 +661,19 @@ ${REVIEW_SHARED_BASE_CSS} .dc-meta-link { color: var(--dc-primary); text-decoration: none; font-weight: 500; } .dc-meta-link:hover { text-decoration: underline; } [data-cite] strong { font-weight: 600; } - .dc-verdict { - display: flex; - gap: 1.5rem; - padding: 0.85rem 1rem; - margin-bottom: 2.25rem; - font-family: var(--dc-font-family-mono); - font-size: 12px; - border: 1px solid var(--dc-border); - background: var(--dc-muted); - } - .dc-verdict .v-found { color: var(--dc-verified); } - .dc-verdict .v-partial { color: var(--dc-partial); } - .dc-verdict .v-miss { color: var(--dc-destructive); } h1 { font-size: 30px; font-weight: 600; letter-spacing: -0.02em; } h2 { - counter-increment: h2section; - counter-reset: h3section; font-size: 20px; font-weight: 600; margin: 2.75rem 0 0.85rem; padding-bottom: 0.5rem; border-bottom: 1px solid var(--dc-border); letter-spacing: -0.01em; - position: relative; - } - h2::before { - content: counter(h2section, decimal-leading-zero); - position: absolute; - left: -5rem; - top: 0.35rem; - font-family: var(--dc-font-family-mono); - font-size: 12px; - font-weight: 500; - color: var(--dc-primary); - letter-spacing: 0.05em; } h3 { - counter-increment: h3section; font-size: 16px; font-weight: 600; margin: 1.75rem 0 0.5rem; - position: relative; - } - h3::before { - content: counter(h2section, decimal-leading-zero) "." counter(h3section); - position: absolute; - left: -5rem; - top: 0.2rem; - font-family: var(--dc-font-family-mono); - font-size: 11px; - font-weight: 500; - color: var(--dc-subtle-foreground); } .dc-section { background: var(--dc-background); border: 1px solid var(--dc-border); padding: 1.25rem 1.5rem; margin: 1rem 0; } .mono { font-family: var(--dc-font-family-mono); font-size: 14px; font-weight: 500; } @@ -707,7 +736,6 @@ ${REVIEW_SHARED_BASE_CSS} .dc-claim-text em { font-style: italic; } @media (max-width: 720px) { body { padding: 2rem 1.25rem 3rem; } - body > header::before, h2::before, h3::before { position: static; display: block; margin-bottom: 0.2rem; } .dc-footer { margin-left: 0; padding-left: 0; } } @media print { @@ -729,7 +757,6 @@ ${
` : "" } -
${bodyHtml}