From f58a1ba7a46a6ad3a076cabfc18af91d02758846 Mon Sep 17 00:00:00 2001 From: Benson Date: Thu, 16 Apr 2026 09:49:15 -0600 Subject: [PATCH 01/13] Restore report verdict shell --- src/cli/markdownToHtml.ts | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/cli/markdownToHtml.ts b/src/cli/markdownToHtml.ts index 96aa409d..f3bbd28e 100644 --- a/src/cli/markdownToHtml.ts +++ b/src/cli/markdownToHtml.ts @@ -782,9 +782,6 @@ const REVIEW_SHARED_BASE_CSS = ` * { margin: 0; padding: 0; box-sizing: border- summary:hover { color: var(--dc-foreground); } .dc-section { background: var(--dc-background); border: 1px solid var(--dc-border); padding: 1.25rem 1.5rem; margin: 1rem 0; } .mono { font-family: var(--dc-font-family-mono); font-size: 14px; font-weight: 500; } - .dc-verdict .v-found { color: var(--dc-verified); } - .dc-verdict .v-partial { color: var(--dc-partial); } - .dc-verdict .v-miss { color: var(--dc-destructive); } .dc-meta-sep { display: none; } .dc-cowork-notice svg { flex-shrink: 0; margin-top: 2px; }`; From beb628d79a7652bd8387d7a09d54d43d5e5ea26e Mon Sep 17 00:00:00 2001 From: Benson Date: Thu, 16 Apr 2026 09:49:31 -0600 Subject: [PATCH 02/13] Update prepare CLI output contract --- src/__tests__/cliCommands.test.ts | 164 ++++++++++++++++++++++++++---- src/__tests__/cliText.test.ts | 2 +- src/cli/commands.ts | 145 ++++++++++++-------------- src/cli/hydrate.ts | 10 +- src/cli/slice.ts | 4 +- src/cli/text.ts | 4 +- src/cli/textRender.ts | 4 +- 7 files changed, 222 insertions(+), 111 deletions(-) diff --git a/src/__tests__/cliCommands.test.ts b/src/__tests__/cliCommands.test.ts index a3eae9cb..c2985385 100644 --- a/src/__tests__/cliCommands.test.ts +++ b/src/__tests__/cliCommands.test.ts @@ -7,7 +7,7 @@ */ import { randomBytes } from "node:crypto"; -import { mkdirSync, readFileSync, writeFileSync } from "node:fs"; +import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs"; import { tmpdir } from "node:os"; import { join, resolve } from "node:path"; @@ -567,7 +567,7 @@ describe("prepare command", () => { ); }); - it("prepares a URL", async () => { + it("prints prepared URL JSON to stdout by default", async () => { const tmpDir = makeTmpDir(); const origCwd = process.cwd(); process.chdir(tmpDir); @@ -575,14 +575,20 @@ describe("prepare command", () => { try { mockPrepareUrl.mockResolvedValue({ attachmentId: "att-123", - deepTextPages: ["some text"], + deepTextPages: [`some text`], metadata: { pageCount: 1, textByteSize: 1024 }, }); const { stdout } = await captureOutput(() => prepare(["https://example.com/article"], fmtNetErr)); expect(mockPrepareUrl).toHaveBeenCalledWith(expect.objectContaining({ url: "https://example.com/article" })); - expect(stdout).toContain(".deepcitation/prepare-example.com.json"); + const output = JSON.parse(stdout); + expect(output).toEqual({ + attachmentId: "att-123", + metadata: { pageCount: 1, textByteSize: 1024 }, + deepTextPages: [`some text`], + }); + expect(existsSync(join(tmpDir, ".deepcitation"))).toBe(false); } finally { process.chdir(origCwd); } @@ -608,7 +614,7 @@ describe("prepare command", () => { } }); - it("prints text JSON when --text is used", async () => { + it("writes prepared JSON only when --out is provided", async () => { const tmpDir = makeTmpDir(); const origCwd = process.cwd(); process.chdir(tmpDir); @@ -616,21 +622,28 @@ describe("prepare command", () => { try { mockPrepareUrl.mockResolvedValue({ attachmentId: "att-123", - deepTextPages: ["deep text here"], + deepTextPages: [`deep text here`], metadata: { pageCount: 2, textByteSize: 2048 }, }); - const { stdout } = await captureOutput(() => prepare(["https://example.com/article", "--text"], fmtNetErr)); + const outPath = join(tmpDir, ".deepcitation", "prepare-example.json"); + const { stdout } = await captureOutput(() => + prepare(["https://example.com/article", "--out", outPath], fmtNetErr), + ); - const summary = JSON.parse(stdout); - expect(summary.attachmentId).toBe("att-123"); - expect(summary.deepTextPages).toEqual(["deep text here"]); + expect(stdout).toBe(outPath); + const output = JSON.parse(readFileSync(outPath, "utf-8")); + expect(output.attachmentId).toBe("att-123"); + expect(output.metadata).toEqual({ pageCount: 2, textByteSize: 2048 }); + expect(output.deepTextPages).toEqual([ + `deep text here`, + ]); } finally { process.chdir(origCwd); } }); - it("--text strips and metadata from deepTextPages", async () => { + it("prints tagged text when --format txt is used", async () => { const tmpDir = makeTmpDir(); const origCwd = process.cwd(); process.chdir(tmpDir); @@ -645,16 +658,123 @@ describe("prepare command", () => { metadata: { pageCount: 2, textByteSize: 512 }, }); - const { stdout } = await captureOutput(() => prepare(["https://example.com/doc", "--text"], fmtNetErr)); + const { stdout } = await captureOutput(() => prepare(["https://example.com/doc", "--format", "txt"], fmtNetErr)); + + expect(stdout).toContain(""); + expect(stdout).toContain('Hello world.'); + expect(stdout).toContain(""); + expect(stdout).toContain(""); + expect(stdout).toContain('Page two content.'); + expect(stdout).toContain(""); + } finally { + process.chdir(origCwd); + } + }); + + it("prints cleaned prose when --format plain is used", async () => { + const tmpDir = makeTmpDir(); + const origCwd = process.cwd(); + process.chdir(tmpDir); + + try { + mockPrepareUrl.mockResolvedValue({ + attachmentId: "att-tagged", + deepTextPages: [ + `Hello world.Second line.`, + `Page two content.`, + ], + metadata: { pageCount: 2, textByteSize: 512 }, + }); + + const { stdout } = await captureOutput(() => + prepare(["https://example.com/doc", "--format", "plain"], fmtNetErr), + ); + + expect(stdout).not.toContain(" { + const tmpDir = makeTmpDir(); + const origCwd = process.cwd(); + process.chdir(tmpDir); + + try { + mockPrepareUrl.mockResolvedValue({ + attachmentId: "att-tagged", + deepTextPages: [`Hello world.`], + metadata: { pageCount: 1, textByteSize: 128 }, + }); + + const outPath = join(tmpDir, "evidence.txt"); + const { stdout } = await captureOutput(() => + prepare(["https://example.com/doc", "--format", "txt", "--out", outPath], fmtNetErr), + ); + + expect(stdout).toBe(outPath); + expect(readFileSync(outPath, "utf-8")).toBe( + `Hello world.`, + ); + } finally { + process.chdir(origCwd); + } + }); + + it("preserves page wrappers when --pages selects a subset", async () => { + const tmpDir = makeTmpDir(); + const origCwd = process.cwd(); + process.chdir(tmpDir); + + try { + mockPrepareUrl.mockResolvedValue({ + attachmentId: "att-pages", + deepTextPages: [ + `Page one.`, + `Page two.`, + ], + metadata: { pageCount: 2, textByteSize: 256 }, + }); + + const { stdout } = await captureOutput(() => prepare(["https://example.com/doc", "--pages", "2"], fmtNetErr)); + const output = JSON.parse(stdout); + + expect(output.deepTextPages).toEqual([ + `Page two.`, + ]); + } finally { + process.chdir(origCwd); + } + }); + + it.each(["--text", "--txt", "--summary"])("rejects removed prepare flag %s", async removedFlag => { + await expect(captureOutput(() => prepare(["https://example.com/doc", removedFlag], fmtNetErr))).rejects.toThrow( + "process.exit(1)", + ); + }); + + it("accepts --out before the source argument", async () => { + const tmpDir = makeTmpDir(); + const origCwd = process.cwd(); + process.chdir(tmpDir); + + try { + mockPrepareUrl.mockResolvedValue({ + attachmentId: "att-ordered", + deepTextPages: [`Ordered.`], + metadata: { pageCount: 1, textByteSize: 64 }, + }); + + const outPath = join(tmpDir, "ordered.json"); + await captureOutput(() => prepare(["--out", outPath, "https://example.com/doc"], fmtNetErr)); - const summary = JSON.parse(stdout); - expect(summary.deepTextPages[0]).not.toContain(" { const { stdout } = await captureOutput(() => prepare([filePath], fmtNetErr)); expect(mockUploadFile).toHaveBeenCalled(); - expect(stdout).toContain("prepare-report.json"); + const output = JSON.parse(stdout); + expect(output.attachmentId).toBe("att-456"); + expect(output.deepTextPages).toEqual(["text"]); } finally { process.chdir(origCwd); } diff --git a/src/__tests__/cliText.test.ts b/src/__tests__/cliText.test.ts index 84013054..d61f0e18 100644 --- a/src/__tests__/cliText.test.ts +++ b/src/__tests__/cliText.test.ts @@ -118,7 +118,7 @@ describe("text", () => { expect(code).toBe(1); }); - it("rejects --format json (use prepare --text)", () => { + it("rejects --format json (use prepare default JSON output)", () => { const prep = writePrepareFile(tmp, 1); const code = runAndCatchExit([prep, "-f", "json"]); expect(code).toBe(1); diff --git a/src/cli/commands.ts b/src/cli/commands.ts index 52b42182..e26014f5 100644 --- a/src/cli/commands.ts +++ b/src/cli/commands.ts @@ -44,7 +44,6 @@ import { sanitizeForLog } from "../utils/logSafety.js"; import { normalizeCitationsFile } from "../utils/normalizeCitations.js"; import { detectProxyUrl } from "../utils/proxy.js"; import { safeExec, safeReplace, safeTest } from "../utils/regexSafety.js"; -import { cleanDeepTextPage } from "../utils/textCleanup.js"; import { validateCitationData } from "../utils/validateCitationData.js"; import { CDN_JS } from "../vanilla/_generated_cdn.js"; import { @@ -60,7 +59,6 @@ import { die, extractApiKey, isValidApiKeyFormat, normalizeShortFlags, parseArgs import { findSummaryForMarkdown, hydrateCitations, parseSummaryToLineMap } from "./hydrate.js"; import { generateReviewVariants, markdownToHtml, type ReportStyle } from "./markdownToHtml.js"; import { createCoworkFetch, createProxyFetch } from "./proxy.js"; -import type { TextFormat } from "./textRender.js"; import { applyLineIds, parseFormatMode, parseLineIdsMode, renderTextStream, resolvePageSpec } from "./textRender.js"; // Re-export so cli.ts and tests can import from the single commands module @@ -118,23 +116,17 @@ Examples: export const PREPARE_HELP = `Usage: deepcitation prepare [options] Prepare a file or URL for citation verification. Uploads the source to the -DeepCitation API and saves the response JSON (attachmentId + deepTextPages). +DeepCitation API and prints the prepared output to stdout by default. Arguments: Local file path or URL to prepare Options: - --out, -o Output path (default: .deepcitation/prepare-{name}.json - for JSON mode, .deepcitation/{name}.txt for --txt mode) - --text Print cleaned {attachmentId, deepTextPages} JSON to stdout. - Backward-compatible default: strips / - tags unless --line-ids is also passed. - --txt Write tagged text to .deepcitation/{name}.txt (LLM default). - Equivalent to --format txt with default line-id sampling. + --out, -o Write output to file instead of stdout --format, -f Output format override: "json" | "txt" | "plain" - - json: {attachmentId, deepTextPages} JSON + - json: {attachmentId, metadata, deepTextPages} JSON - txt: raw deepTextPages with and - tags (what citation authoring wants) + tags - plain: page text with all tags stripped, pages joined by "\\n\\n" --line-ids, -l Line-ID tag sampling: "default" | "none" | "every=N" - default: every-5 + first/last (server default) @@ -150,11 +142,11 @@ Options: Examples: deepcitation prepare report.pdf - deepcitation prepare report.pdf --txt # LLM default: tagged text to .txt file - deepcitation prepare report.pdf --txt -p 1-10 # only first 10 pages - deepcitation prepare report.pdf --text # back-compat: cleaned JSON to stdout - deepcitation prepare report.pdf --text -f txt # stdout: tagged text instead of JSON - deepcitation prepare https://example.com/article --txt + deepcitation prepare report.pdf --out .deepcitation/prepare-report.json + deepcitation prepare report.pdf --format txt # prompt-ready tagged text to stdout + deepcitation prepare report.pdf --format txt -p 1-10 # only first 10 pages + deepcitation prepare report.pdf --format plain + deepcitation prepare https://example.com/article --format txt `; export const VERIFY_HELP = `Usage: deepcitation verify [options] @@ -253,6 +245,36 @@ const ALLOWED_INDICATORS = ["icon", "dot", "none"] as const; // ── helpers ─────────────────────────────────────────────────────── const DEFAULT_API_URL = "https://api.deepcitation.com"; +const PREPARE_VALUE_FLAGS = new Set(["--out", "--format", "--line-ids", "--pages"]); + +function findPrepareSource(argv: string[]): string | undefined { + for (let i = 0; i < argv.length; i++) { + const arg = argv[i]; + if (!arg) continue; + if (PREPARE_VALUE_FLAGS.has(arg)) { + i++; + continue; + } + if (!arg.startsWith("-")) return arg; + } + return undefined; +} + +function prepareMigrationError(flag: string): never { + die( + `${flag} is no longer supported by prepare. ` + + `Use the default JSON output, --format txt, --format plain, and/or --out .`, + PREPARE_HELP, + ); +} + +function writePrepareFile(outPath: string, body: string): void { + const parent = dirname(outPath); + if (!existsSync(parent)) mkdirSync(parent, { recursive: true }); + writeFileSync(outPath, body); // lgtm[js/http-to-file-access] + console.error(` Saved: ${outPath}`); + console.log(outPath); +} export function canStartBrowserAuth(argv: string[] = []): boolean { // --browser is an explicit opt-in that starts the OAuth flow even in constrained @@ -438,26 +460,22 @@ export async function prepare(argv: string[], _fmtNetErr: (err: unknown) => stri // Extract boolean flags before parseArgs (which only handles --key value pairs) const unsafeFast = normalized.includes("--unsafe-fast"); - const textFlag = normalized.includes("--text") || normalized.includes("--summary"); - const txtFlag = normalized.includes("--txt"); const skipCache = normalized.includes("--skip-cache"); - const booleans = new Set(["--unsafe-fast", "--text", "--summary", "--skip-cache", "--txt"]); + for (const removedFlag of ["--text", "--txt", "--summary"]) { + if (normalized.includes(removedFlag)) prepareMigrationError(removedFlag); + } + const booleans = new Set(["--unsafe-fast", "--skip-cache"]); const filteredArgv = normalized.filter(a => !booleans.has(a)); const args = parseArgs(filteredArgv, PREPARE_HELP); - // Positional argument: first non-flag token. - const positional = filteredArgv.find(a => !a.startsWith("--")); + // Positional argument: first non-flag token that is not a flag value. + const positional = findPrepareSource(filteredArgv); if (!positional) die("A file path or URL is required", PREPARE_HELP); // Validate format + line-id flags up-front so bad input fails before the API call. const lineIdsMode = parseLineIdsMode(args["line-ids"], PREPARE_HELP); - const fallbackFormat: TextFormat = txtFlag ? "txt" : "json"; - const format = parseFormatMode(args.format, fallbackFormat, PREPARE_HELP); - - if (txtFlag && format === "json") { - die("--txt conflicts with --format json; drop --txt or pass --format txt/plain", PREPARE_HELP); - } + const format = parseFormatMode(args.format, "json", PREPARE_HELP); const { apiKey } = await requireAuth(); const dc = await createClient(apiKey); @@ -468,16 +486,13 @@ export async function prepare(argv: string[], _fmtNetErr: (err: unknown) => stri } let result; - let label: string; if (isUrl) { - label = new URL(positional).hostname.replace(/^www\./, ""); console.error(unsafeFast ? `Preparing URL (fast mode)...` : `Preparing URL (this may take ~30s)...`); result = await dc.prepareUrl({ url: positional, unsafeFastUrlOutput: unsafeFast, skipCache }); } else { const filePath = resolve(positional); if (!existsSync(filePath)) die(`File not found: ${positional}`, PREPARE_HELP); - label = basename(filePath).replace(/\.[^.]+$/, ""); console.error(`Preparing file: ${basename(filePath)}...`); const buffer = readFileSync(filePath); result = await dc.uploadFile(buffer, { filename: basename(filePath) }); @@ -486,61 +501,35 @@ export async function prepare(argv: string[], _fmtNetErr: (err: unknown) => stri const pickedIndices = resolvePageSpec(args.pages, result.deepTextPages.length, PREPARE_HELP); const selectedPages = pickedIndices.map(i => result.deepTextPages[i] as string); - const outDir = resolve(".deepcitation"); - if (!existsSync(outDir)) mkdirSync(outDir, { recursive: true }); - - // --txt mode: write tagged text to a .txt file (LLM default). - if (txtFlag) { - const txtPath = resolve(args.out ?? `.deepcitation/${label}.txt`); - const body = renderTextStream(selectedPages, format === "json" ? "txt" : format, lineIdsMode); - writeFileSync(txtPath, body); // lgtm[js/http-to-file-access] - console.error(` Attachment ID: ${sanitizeForLog(result.attachmentId)}`); - console.error( - ` Pages: ${pickedIndices.length}${pickedIndices.length !== result.metadata.pageCount ? ` / ${result.metadata.pageCount}` : ""}`, - ); - console.error(` Text: ${Math.round(result.metadata.textByteSize / 1024)}KB`); - if (result.processingTimeMs) { - console.error(` Time: ${(result.processingTimeMs / 1000).toFixed(1)}s`); - } - console.error(` Saved: ${txtPath}`); - console.log(txtPath); - return; - } - - // Default path: write the full prepare response as JSON to disk. - const outPath = resolve(args.out ?? `.deepcitation/prepare-${label}.json`); - writeFileSync(outPath, JSON.stringify(result, null, 2)); // lgtm[js/http-to-file-access] + const body = + format === "json" + ? JSON.stringify( + { + attachmentId: result.attachmentId, + metadata: result.metadata, + deepTextPages: selectedPages.map(page => applyLineIds(page, lineIdsMode)), + }, + null, + 2, + ) + : renderTextStream(selectedPages, format, lineIdsMode); console.error(` Attachment ID: ${sanitizeForLog(result.attachmentId)}`); - console.error(` Pages: ${result.metadata.pageCount}`); + console.error( + ` Pages: ${pickedIndices.length}${pickedIndices.length !== result.metadata.pageCount ? ` / ${result.metadata.pageCount}` : ""}`, + ); console.error(` Text: ${Math.round(result.metadata.textByteSize / 1024)}KB`); if (result.processingTimeMs) { console.error(` Time: ${(result.processingTimeMs / 1000).toFixed(1)}s`); } - console.error(` Saved: ${outPath}`); - if (textFlag) { - if (format === "txt" || format === "plain") { - // Stream tagged or plain text to stdout instead of JSON. - process.stdout.write(renderTextStream(selectedPages, format, lineIdsMode)); - process.stdout.write("\n"); - return; - } - // json format — back-compat path. When no --line-ids flag is passed, strip tags - // (current behavior). When --line-ids is explicit, honor it. - const pagesForJson = - args["line-ids"] === undefined - ? selectedPages.map(cleanDeepTextPage) - : selectedPages.map(p => applyLineIds(p, lineIdsMode)); - console.log( - JSON.stringify({ - attachmentId: result.attachmentId, - deepTextPages: pagesForJson, - }), - ); - } else { - console.log(outPath); + if (args.out) { + writePrepareFile(resolve(args.out), body); + return; } + + process.stdout.write(body); + if (!body.endsWith("\n")) process.stdout.write("\n"); } export async function verify( diff --git a/src/cli/hydrate.ts b/src/cli/hydrate.ts index 4e20a6da..a359a07d 100644 --- a/src/cli/hydrate.ts +++ b/src/cli/hydrate.ts @@ -32,13 +32,13 @@ Use this when the draft was generated with the compact citation format Options: --markdown Path to draft markdown file with <<>> block - --summary Path to summary file from "deepcitation prepare --text" + --summary Path to JSON summary file from "deepcitation prepare --out" --out Output path (default: overwrites --markdown input) -h, --help Show this help message Examples: - deepcitation hydrate --markdown .deepcitation/draft.md --summary .deepcitation/summary-report.txt - deepcitation hydrate --markdown .deepcitation/draft.md --summary .deepcitation/summary-report.txt --out .deepcitation/draft-hydrated.md + deepcitation hydrate --markdown .deepcitation/draft.md --summary .deepcitation/prepare-report.json + deepcitation hydrate --markdown .deepcitation/draft.md --summary .deepcitation/prepare-report.json --out .deepcitation/draft-hydrated.md `; /** @@ -63,7 +63,7 @@ export interface LineMap { } export interface HydrateOptions { - /** Raw content of the summary file (JSON string from deepcitation prepare --text) */ + /** Raw content of the summary file (JSON string from deepcitation prepare --out) */ summaryContent: string; /** Citations to hydrate in place — source_context is mutated on matching entries */ citations: CitationData[]; @@ -399,7 +399,7 @@ export function hydrateCitations({ summaryContent, citations, warnOnMiss }: Hydr * * Search order (most reliable first): * 1. `.deepcitation/prepare-*.json` — pure JSON output from `deepcitation prepare` - * 2. `.deepcitation/summary-*.txt` — text+JSON output from `prepare --text` + * 2. `.deepcitation/summary-*.txt` — legacy text+JSON output from `prepare --text` * * When `attachmentId` is provided, scans each candidate and returns the first one * whose JSON contains a matching `attachmentId`. This prevents the wrong evidence diff --git a/src/cli/slice.ts b/src/cli/slice.ts index 156d0bb8..a88e952b 100644 --- a/src/cli/slice.ts +++ b/src/cli/slice.ts @@ -25,7 +25,7 @@ chunks to and prints a JSON manifest to stdout. Arguments: Path to the prepare-.json written by - \`deepcitation prepare \`. + \`deepcitation prepare --out \`. Options: -n, --parts Number of chunks to produce (default: 2) @@ -127,7 +127,7 @@ export function slice(argv: string[]): void { const outDir = resolve(args.out ?? ".deepcitation"); const format = parseFormatMode(args.format, "txt", SLICE_HELP); if (format === "json") { - die("--format json is not supported by slice (use prepare --text instead)", SLICE_HELP); + die("--format json is not supported by slice (use prepare default JSON output instead)", SLICE_HELP); } const lineIdsMode = parseLineIdsMode(args["line-ids"], SLICE_HELP); diff --git a/src/cli/text.ts b/src/cli/text.ts index 49dc0d7c..d79ec44b 100644 --- a/src/cli/text.ts +++ b/src/cli/text.ts @@ -21,7 +21,7 @@ no network, no auth, no re-upload. Emits to stdout by default. Arguments: Path to the prepare-.json written by - \`deepcitation prepare \`. + \`deepcitation prepare --out \`. Options: -p, --pages Page spec: "1-5,10" | "first=10" | "last=10" | "all" @@ -48,7 +48,7 @@ export function text(argv: string[]): void { const format = parseFormatMode(args.format, "txt", TEXT_HELP); if (format === "json") { - die("--format json is not supported by text (use prepare --text instead)", TEXT_HELP); + die("--format json is not supported by text (use prepare default JSON output instead)", TEXT_HELP); } const lineIdsMode = parseLineIdsMode(args["line-ids"], TEXT_HELP); diff --git a/src/cli/textRender.ts b/src/cli/textRender.ts index fecb7be0..e2fa7079 100644 --- a/src/cli/textRender.ts +++ b/src/cli/textRender.ts @@ -1,5 +1,5 @@ /** - * Deterministic text rendering primitives shared by `prepare --txt`, + * Deterministic text rendering primitives shared by `prepare`, * `slice`, and `text` subcommands. * * Everything here is a pure function over strings — no filesystem, no network, @@ -193,7 +193,7 @@ export function applyLineIds(page: string, mode: LineIdsMode): string { * * - `txt` preserves `` wrappers and `` tags (subject * to `lineIds` mode), joining pages with `\n`. This is the LLM-default - * output for `prepare --txt` and `slice`. + * output for `prepare`, `text`, and `slice`. * - `plain` strips both page wrappers and line tags via `cleanDeepTextPage`, * then joins pages with a blank-line separator so prose stays readable. */ From de8b99456a5d76751616866dc53f51b6701c0238 Mon Sep 17 00:00:00 2001 From: Benson Date: Thu, 16 Apr 2026 10:15:08 -0600 Subject: [PATCH 03/13] Harden citation block parsing --- src/__tests__/citationParser.test.ts | 23 ++++++++++++------- src/__tests__/cite.test.ts | 16 +++++++++++++ src/cli/cite.ts | 34 ++++++++++++++++------------ src/cli/commands.ts | 19 ++++++++++++---- src/parsing/citationParser.ts | 33 ++++++++++++++------------- 5 files changed, 82 insertions(+), 43 deletions(-) diff --git a/src/__tests__/citationParser.test.ts b/src/__tests__/citationParser.test.ts index 5a4a3676..66a15876 100644 --- a/src/__tests__/citationParser.test.ts +++ b/src/__tests__/citationParser.test.ts @@ -162,6 +162,19 @@ ${CITATION_DATA_START_DELIMITER} expect(result.citations.length).toBe(1); }); + it("handles malformed end delimiter variant", () => { + const response = `Test [1]. + +${CITATION_DATA_START_DELIMITER} +[{"id": 1, "attachment_id": "a", "source_context": "test", "source_match": "test"}] +<<>>`; + + const result = parseCitationData(response); + + expect(result.success).toBe(true); + expect(result.citations.length).toBe(1); + }); + it("handles empty citation block", () => { const response = `No citations here. @@ -175,19 +188,13 @@ ${CITATION_DATA_END_DELIMITER}`; expect(result.citations.length).toBe(0); }); - it("fails when citation block is present but whitespace-only", () => { - // A bare <<>> block with no content is an authoring mistake - // (e.g. unfilled template placeholder), not a legitimate empty result. - // An empty array `[]` is fine; whitespace-only is a failure. - // See plans/noble-skipping-wolf.md for the parallel-agent merge failure this caught. + it("treats whitespace-only citation blocks as recoverable empties", () => { const response = `Body text.\n\n${CITATION_DATA_START_DELIMITER}\n\n${CITATION_DATA_END_DELIMITER}\n`; const result = parseCitationData(response); - expect(result.success).toBe(false); - expect(result.error).toMatch(/empty/i); + expect(result.success).toBe(true); expect(result.citations.length).toBe(0); - // visibleText is still extracted even on failure expect(result.visibleText).toBe("Body text."); }); diff --git a/src/__tests__/cite.test.ts b/src/__tests__/cite.test.ts index 707a7de3..80906f4a 100644 --- a/src/__tests__/cite.test.ts +++ b/src/__tests__/cite.test.ts @@ -39,6 +39,14 @@ describe("extractMarkersFromBody", () => { expect(markers[0].claimText).toBe("Discount Rate"); }); + it("preserves alternate labels for a reused cite id", () => { + const body = "[Horizontal Boundaries](cite:1) and [Vertical Boundaries](cite:1)"; + const markers = extractMarkersFromBody(body); + expect(markers).toHaveLength(1); + expect(markers[0].claimText).toBe("Horizontal Boundaries"); + expect(markers[0].claimTextVariants).toEqual(["Vertical Boundaries"]); + }); + it("extracts double-quoted anchor hint", () => { const body = '[terminates](cite:3 "automatically terminate")'; const markers = extractMarkersFromBody(body); @@ -108,6 +116,14 @@ describe("extractMarkersFromBody", () => { expect(markers).toHaveLength(1); expect(markers[0].claimText).toBe("Rate"); }); + + it("preserves alternate labels for reused bold markers", () => { + const body = "**Horizontal Boundaries** [1] and **Vertical Boundaries** [1]"; + const markers = extractMarkersFromBody(body); + expect(markers).toHaveLength(1); + expect(markers[0].claimText).toBe("Horizontal Boundaries"); + expect(markers[0].claimTextVariants).toEqual(["Vertical Boundaries"]); + }); }); // ── getAllLines ─────────────────────────────────────────────────── diff --git a/src/cli/cite.ts b/src/cli/cite.ts index 309e3fe6..85aa06b7 100644 --- a/src/cli/cite.ts +++ b/src/cli/cite.ts @@ -81,6 +81,8 @@ export function getAllLines(lineMap: LineMap): LineEntry[] { export interface BodyMarker { id: number; claimText: string; + /** Alternate labels that reused the same citation ID. */ + claimTextVariants?: string[]; /** Verbatim anchor text from the evidence, if provided via title syntax. */ anchorHint?: string; } @@ -102,26 +104,26 @@ export interface BodyMarker { export function extractMarkersFromBody(body: string): BodyMarker[] { // Match [label](cite:N ...) — capture everything inside the parens after cite:N const re = /\[([^\][]+)\]\(cite:(\d+)((?:\s+[^)]*)?)\)/g; - const seen = new Map(); // id → first display label - const results: BodyMarker[] = []; + const seen = new Map(); // id → first marker, with alternates preserved let m: RegExpExecArray | null; while ((m = safeExec(re, body)) !== null) { const label = m[1].trim(); const id = parseInt(m[2], 10); const rest = m[3]?.trim() ?? ""; - if (seen.has(id)) { - if (seen.get(id) !== label) { + const existing = seen.get(id); + if (existing) { + if (existing.claimText !== label && !(existing.claimTextVariants?.includes(label) ?? false)) { console.error( ` Warning: cite:${id} reused with different label — ` + - `"${sanitizeForLog(seen.get(id) ?? "")}" (used) vs "${sanitizeForLog(label)}" (ignored). ` + + `"${sanitizeForLog(existing.claimText)}" (used) vs "${sanitizeForLog(label)}" (ignored). ` + `Each distinct claim must use a unique ID.`, ); + existing.claimTextVariants ??= []; + existing.claimTextVariants.push(label); } continue; } - seen.set(id, label); - const marker: BodyMarker = { id, claimText: label }; // Parse optional anchor hint (single or double quoted) @@ -130,32 +132,34 @@ export function extractMarkersFromBody(body: string): BodyMarker[] { const anchorRaw = anchorDQ?.[1] ?? anchorSQ?.[1]; if (anchorRaw?.trim()) marker.anchorHint = anchorRaw.trim(); - results.push(marker); + seen.set(id, marker); } // Fallback: **bold text** [N] markers (Strategy 2c format). // Only used when no [text](cite:N) markers were found. - if (results.length === 0) { + if (seen.size === 0) { const boldRe = /\*\*([^*]+)\*\*\s*\[(\d+)\]/g; let bm: RegExpExecArray | null; while ((bm = safeExec(boldRe, body)) !== null) { const label = bm[1].trim(); const id = parseInt(bm[2], 10); - if (seen.has(id)) { - if (seen.get(id) !== label) { + const existing = seen.get(id); + if (existing) { + if (existing.claimText !== label && !(existing.claimTextVariants?.includes(label) ?? false)) { console.error( ` Warning: [${id}] reused with different label — ` + - `"${sanitizeForLog(seen.get(id) ?? "")}" (used) vs "${sanitizeForLog(label)}" (ignored). ` + + `"${sanitizeForLog(existing.claimText)}" (used) vs "${sanitizeForLog(label)}" (ignored). ` + `Each distinct claim must use a unique ID.`, ); + existing.claimTextVariants ??= []; + existing.claimTextVariants.push(label); } continue; } - seen.set(id, label); - results.push({ id, claimText: label }); + seen.set(id, { id, claimText: label }); } } - return results.sort((a, b) => a.id - b.id); + return [...seen.values()].sort((a, b) => a.id - b.id); } /** Generic words skipped by Strategy 3 to avoid wrong-context single-word matches. */ diff --git a/src/cli/commands.ts b/src/cli/commands.ts index e26014f5..cfb28f07 100644 --- a/src/cli/commands.ts +++ b/src/cli/commands.ts @@ -872,9 +872,19 @@ export async function verifyMarkdown(argv: string[], fmtNetErr: (err: unknown) = const allLines = getAllLines(lineMap); const citations: CitationData[] = []; - for (const { id, claimText, anchorHint } of markers) { - const searchTerm = anchorHint ?? claimText; - const found = findAnchorWithFallback(searchTerm, allLines); + for (const { id, claimText, claimTextVariants, anchorHint } of markers) { + const searchTerms = anchorHint + ? [anchorHint, claimText, ...(claimTextVariants ?? [])] + : [claimText, ...(claimTextVariants ?? [])]; + let found: ReturnType | null = null; + let usedSearchTerm: string | undefined; + for (const searchTerm of searchTerms) { + found = findAnchorWithFallback(searchTerm, allLines); + if (found) { + usedSearchTerm = searchTerm; + break; + } + } if (!found) { console.error(` Citation ${id} ("${claimText}"): not found in evidence`); continue; @@ -887,7 +897,8 @@ export async function verifyMarkdown(argv: string[], fmtNetErr: (err: unknown) = page_id: toCompactPageId(pageId), line_ids: [lineId], attachment_id: attachmentId, - claim_text: claimText.toLowerCase() !== sourceMatch.toLowerCase() ? claimText : undefined, + claim_text: + usedSearchTerm && usedSearchTerm.toLowerCase() !== sourceMatch.toLowerCase() ? usedSearchTerm : undefined, }); } diff --git a/src/parsing/citationParser.ts b/src/parsing/citationParser.ts index 13e9caf8..f75bbf50 100644 --- a/src/parsing/citationParser.ts +++ b/src/parsing/citationParser.ts @@ -85,6 +85,8 @@ const CITATION_MARKER_RE = /\[(\d+)\]/g; */ const CITATION_LINK_RE = /\[([^\][]+)\]\(cite:(\d+)\)/g; +const CITATION_DATA_END_DELIMITER_VARIANTS = [CITATION_DATA_END_DELIMITER, "<<>>"] as const; + /** * Type guard to validate that an object has the required CitationData structure. * Ensures at minimum the id field is present and is a number. @@ -317,8 +319,15 @@ export function parseCitationData(llmResponse: string): ParsedCitationResponse { // Extract visible text (everything before the delimiter) const visibleText = llmResponse.substring(0, startIndex).trim(); - // Find the end delimiter - const endIndex = llmResponse.indexOf(CITATION_DATA_END_DELIMITER, startIndex); + // Find the end delimiter. Accept a small set of malformed variants because + // LLMs occasionally emit the wrong closing token while still providing usable JSON. + let endIndex = -1; + for (const delimiter of CITATION_DATA_END_DELIMITER_VARIANTS) { + const idx = llmResponse.indexOf(delimiter, startIndex); + if (idx !== -1 && (endIndex === -1 || idx < endIndex)) { + endIndex = idx; + } + } // Extract the JSON block const jsonStartIndex = startIndex + CITATION_DATA_START_DELIMITER.length; @@ -332,25 +341,17 @@ export function parseCitationData(llmResponse: string): ParsedCitationResponse { // Empty jsonString can mean two things: // 1. No end delimiter and no content — the output was truncated right at the - // start delimiter (common token-limit cutoff). Treat as success with 0 citations. - // 2. End delimiter is present but block is empty — upstream mistake (unfilled - // template placeholder, etc.). Return failure. + // start delimiter (common token-limit cutoff). + // 2. End delimiter is present but block is empty — upstream mistake + // (unfilled template placeholder, etc.). + // In both cases, treat this as a recoverable no-citations result so callers + // can fall back to body markers instead of hard-failing on the block itself. if (!jsonString) { - if (endIndex === -1) { - // Truncated immediately after start delimiter - return { - visibleText, - citations: [], - citationMap: new Map(), - success: true, - }; - } return { visibleText, citations: [], citationMap: new Map(), - success: false, - error: "Empty <<>> block: no JSON content between delimiters", + success: true, }; } From e5b1e5b2ac3e98a02be6b539dfcdc7daae2b7b8a Mon Sep 17 00:00:00 2001 From: Benson Date: Thu, 16 Apr 2026 10:38:20 -0600 Subject: [PATCH 04/13] Harden markdown citation rendering --- src/__tests__/citationParser.test.ts | 24 +++++++++ src/__tests__/cliCommands.test.ts | 3 ++ src/__tests__/client.test.ts | 22 ++++++++ src/__tests__/markdownToHtml.test.ts | 11 ++-- src/cli/commands.ts | 15 +++--- src/cli/markdownToHtml.ts | 80 +++++++++++++++++----------- src/client/DeepCitation.ts | 47 +++++++++++----- src/client/types.ts | 5 ++ src/parsing/citationParser.ts | 60 +++++++++++++++++++++ 9 files changed, 214 insertions(+), 53 deletions(-) diff --git a/src/__tests__/citationParser.test.ts b/src/__tests__/citationParser.test.ts index 66a15876..3f283de9 100644 --- a/src/__tests__/citationParser.test.ts +++ b/src/__tests__/citationParser.test.ts @@ -117,6 +117,30 @@ ${CITATION_DATA_END_DELIMITER}`; expect(result.citations[0].source_context).toContain("Line one"); }); + it("repairs literal newlines inside JSON string values", () => { + const response = `Multi-line content [1]. + +${CITATION_DATA_START_DELIMITER} +[ + { + "id": 1, + "attachment_id": "doc789", + "source_context": "Line one +Line two +Line three", + "source_match": "Line two", + "page_id": "page_number_1_index_0" + } +] +${CITATION_DATA_END_DELIMITER}`; + + const result = parseCitationData(response); + + expect(result.success).toBe(true); + expect(result.citations.length).toBe(1); + expect(result.citations[0].source_context).toContain("Line one"); + }); + it("handles multiple citations in single sentence", () => { const response = `Revenue was $1B [1] with profit of $100M [2] in Q4 [3]. diff --git a/src/__tests__/cliCommands.test.ts b/src/__tests__/cliCommands.test.ts index c2985385..24bb5db2 100644 --- a/src/__tests__/cliCommands.test.ts +++ b/src/__tests__/cliCommands.test.ts @@ -903,6 +903,9 @@ describe("verify command (--citations mode)", () => { const { stderr } = await captureOutput(() => verify(["--citations", citPath], fmtNetErr)); expect(mockVerifyAttachment).toHaveBeenCalledTimes(2); + for (const call of mockVerifyAttachment.mock.calls) { + expect(call[2]).toMatchObject({ requestTimeoutMs: 5000 }); + } expect(stderr).toContain("3 citations across 2 attachment(s)"); } finally { process.chdir(origCwd); diff --git a/src/__tests__/client.test.ts b/src/__tests__/client.test.ts index f600c2af..c52400a4 100644 --- a/src/__tests__/client.test.ts +++ b/src/__tests__/client.test.ts @@ -1307,5 +1307,27 @@ describe("DeepCitation Client", () => { // Only one fetch attempt — aborted during the delay before the second attempt expect(mockFetch).toHaveBeenCalledTimes(1); }); + + it("does not retry when fetch rejects with AbortError", async () => { + const client = new DeepCitation({ apiKey: "sk-dc-test-key-00000001", maxRetries: 3 }); + + mockFetch.mockRejectedValueOnce(new DOMException("Aborted", "AbortError")); + + const blob = new Blob(["content"]); + await expect(client.uploadFile(blob, { filename: "test.pdf" })).rejects.toThrow("Aborted"); + + expect(mockFetch).toHaveBeenCalledTimes(1); + }); + + it("does not retry when fetch rejects with TimeoutError", async () => { + const client = new DeepCitation({ apiKey: "sk-dc-test-key-00000001", maxRetries: 3 }); + + mockFetch.mockRejectedValueOnce(new DOMException("Request timed out after 50ms", "TimeoutError")); + + const blob = new Blob(["content"]); + await expect(client.uploadFile(blob, { filename: "test.pdf" })).rejects.toThrow("timed out"); + + expect(mockFetch).toHaveBeenCalledTimes(1); + }); }); }); diff --git a/src/__tests__/markdownToHtml.test.ts b/src/__tests__/markdownToHtml.test.ts index a92fde6f..81d45a5d 100644 --- a/src/__tests__/markdownToHtml.test.ts +++ b/src/__tests__/markdownToHtml.test.ts @@ -44,12 +44,10 @@ describe("wrapCitationMarkers", () => { expect(result).toContain("revenue grew significantly"); }); it("emits empty span for punctuation-only anchors", () => { - // Schedule "C" produces an anchor of just `"` after the regex cuts at the quote const html = '

Schedule "C" [1]

'; const result = wrapCitationMarkers(html); expect(result).toContain('data-cite="1"'); - // The span should have no inner text content (empty anchor) - expect(result).toMatch(/<\/span>/); + expect(result).toContain('Schedule "C"'); }); }); @@ -110,6 +108,13 @@ describe("markdownToHtml block parsing", () => { expect(result).toContain("

"); }); + it("renders headings with CRLF line endings", () => { + const result = markdownToHtml("# Title\r\n\r\n## Section\r\n\r\n### Sub", { style: "plain" }); + expect(result).toContain("

"); + expect(result).toContain("

"); + expect(result).toContain("

"); + }); + it("renders paragraphs", () => { const result = markdownToHtml("Hello world.\n\nSecond paragraph.", { style: "plain" }); expect(result).toContain("

Hello world.

"); diff --git a/src/cli/commands.ts b/src/cli/commands.ts index cfb28f07..4dbb6fe7 100644 --- a/src/cli/commands.ts +++ b/src/cli/commands.ts @@ -178,7 +178,7 @@ Options: --out Output path (default: {stem}-verified.html in CWD) --output-dir Save HTML and verify-response.json to this directory with stable names --json, --keep-json Also write {stem}-verify-response.json next to the HTML (debug/publish) - --no-publish Skip the auto-upload to My Verifications. Default is to publish as private. + --local-only Skip the auto-upload to My Verifications. --vis, --visibility Published visibility: private | unlisted | public (default: private) --theme Popover color theme (default: "auto") --indicator Indicator variant: icon, dot, none (default: "icon") @@ -187,12 +187,12 @@ Options: -h, --help Show this help message Examples: - deepcitation verify --md .deepcitation/draft-report.md # auto-publishes as private - deepcitation verify --md report.md --claim "Did Q1 revenue exceed $4B?" --model "Claude Haiku 4.5" + deepcitation verify --md .deepcitation/draft-report.md + deepcitation verify --md report.md --claim "Did Q1 revenue exceed $4B?" deepcitation verify --md report.md --style plain deepcitation verify --md report.md --vis unlisted # shareable by link deepcitation verify --md report.md --vis public # (Portal session only) - deepcitation verify --md report.md --no-publish # local-only, don't upload + deepcitation verify --md report.md --local-only deepcitation verify --html report.html --out verified.html deepcitation verify --prompt deepcitation verify --citations .deepcitation/citations-keyed.json @@ -241,6 +241,7 @@ Examples: const ALLOWED_THEMES = ["auto", "light", "dark"] as const; const ALLOWED_INDICATORS = ["icon", "dot", "none"] as const; +const VERIFY_REQUEST_TIMEOUT_MS = 5000; // ── helpers ─────────────────────────────────────────────────────── @@ -620,7 +621,7 @@ export async function verify( // Cast: CLI reads citations from JSON files as Record>, // but verifyAttachment expects its own typed CitationMap. The shapes match at runtime. groupCitations as unknown as Parameters[1], - { outputImageFormat: imageFormat }, + { outputImageFormat: imageFormat, requestTimeoutMs: VERIFY_REQUEST_TIMEOUT_MS }, ); Object.assign(merged, result.verifications); // Preserve per-attachment assets (pageImages, originalDownload) so downstream @@ -1095,7 +1096,7 @@ export async function verifyHtml(argv: string[], _fmtNetErr: (err: unknown) => s // Boolean flags — filter out before parseArgs (which only handles --key value pairs). // --publish / --pub are no-op opt-ins kept for backwards-compat: auto-publish - // is now the default and only needs to be suppressed with --no-publish. + // is now the default and only needs to be suppressed with --local-only. const keepJson = normalized.includes("--json") || normalized.includes("--keep-json"); const booleanFlags = new Set(["--json", "--keep-json"]); const filteredArgv = normalized.filter(a => !booleanFlags.has(a)); @@ -1283,7 +1284,7 @@ export async function verifyHtml(argv: string[], _fmtNetErr: (err: unknown) => s attachmentId, // Cast: same as verify command — JSON-parsed citations → typed CitationMap groupCitations as unknown as Parameters[1], - { outputImageFormat: imageFormat }, + { outputImageFormat: imageFormat, requestTimeoutMs: VERIFY_REQUEST_TIMEOUT_MS }, ); Object.assign(merged, result.verifications); // Invariant: each attachmentId belongs to exactly one group, so result.attachments diff --git a/src/cli/markdownToHtml.ts b/src/cli/markdownToHtml.ts index f3bbd28e..731e7fe4 100644 --- a/src/cli/markdownToHtml.ts +++ b/src/cli/markdownToHtml.ts @@ -115,28 +115,25 @@ export interface CitationSourceMatchMap { [citationId: string]: string; } -/** - * Find [N] markers in HTML content and wrap the appropriate text fragment - * in a . The CDN runtime needs data-cite on inline - * elements for indicator placement. - * - * When `sourceMatchMap` is provided, the sourceMatch for each citation is used as - * the clickable display label. The function searches backward in the text - * before [N] for the sourceMatch (case-insensitive) and wraps only that - * occurrence. This produces short, scannable inline citations that match - * the evidence highlight. - * - * Without `sourceMatchMap`, falls back to wrapping the last clause before [N]. - */ -export function wrapCitationMarkers(html: string, sourceMatchMap?: CitationSourceMatchMap): string { - // Match [N] markers anywhere in text nodes. Excluding `<` and `>` keeps us from - // consuming HTML tag boundaries; excluding `"` keeps us out of quoted attribute values. - return html.replace(/([^<>"]*?)\s*\[(\d+)\]/g, (_match, textBefore: string, num: string) => { +function wrapCitationMarkerTextSegment(text: string, sourceMatchMap?: CitationSourceMatchMap): string { + let out = ""; + let cursor = 0; + const markerRe = /\[(\d+)\]/g; + let match: RegExpExecArray | null; + + while ((match = markerRe.exec(text)) !== null) { + const markerStart = match.index; + const markerEnd = markerStart + match[0].length; + const textBefore = text.slice(cursor, markerStart); + const num = match[1]; const trimmed = textBefore.trimEnd(); - if (!trimmed) return ``; - // ── Strategy 1: Use sourceMatch from citation data ───────────── - // Find the sourceMatch within the preceding text and wrap only that phrase. + if (!trimmed) { + out += textBefore + ``; + cursor = markerEnd; + continue; + } + const sourceMatch = sourceMatchMap?.[num]; if (sourceMatch) { const idx = trimmed.toLowerCase().lastIndexOf(sourceMatch.toLowerCase()); @@ -144,20 +141,18 @@ export function wrapCitationMarkers(html: string, sourceMatchMap?: CitationSourc const before = trimmed.slice(0, idx); const matched = trimmed.slice(idx, idx + sourceMatch.length); const after = trimmed.slice(idx + sourceMatch.length); - return `${before}${matched}${after}`; + out += before + `${matched}` + after; + cursor = markerEnd; + continue; } - // sourceMatch not found in text — fall through to heuristic } - // ── Strategy 2: Heuristic — last clause before [N] ─────────── const clauseMatch = trimmed.match(/(?:[,;–—]\s*)([^,;–—]+)$/); const anchor = clauseMatch ? clauseMatch[1].trim() : trimmed; - - // If the anchor is only punctuation (e.g. the [^<"] regex cut off at a - // literal quote in text content like Schedule "C".), emit an empty span - // so the CDN shows a superscript indicator instead of wrapping garbage. if (!/[a-zA-Z0-9]/.test(anchor)) { - return `${trimmed}`; + out += `${textBefore}`; + cursor = markerEnd; + continue; } const prefix = clauseMatch @@ -165,8 +160,31 @@ export function wrapCitationMarkers(html: string, sourceMatchMap?: CitationSourc clauseMatch[0].slice(0, clauseMatch[0].length - anchor.length) : ""; - return `${prefix}${anchor}`; - }); + out += `${prefix}${anchor}`; + cursor = markerEnd; + } + + return out + text.slice(cursor); +} + +/** + * Find [N] markers in HTML content and wrap the appropriate text fragment + * in a . The CDN runtime needs data-cite on inline + * elements for indicator placement. + * + * When `sourceMatchMap` is provided, the sourceMatch for each citation is used as + * the clickable display label. The function searches backward in the text + * before [N] for the sourceMatch (case-insensitive) and wraps only that + * occurrence. This produces short, scannable inline citations that match + * the evidence highlight. + * + * Without `sourceMatchMap`, falls back to wrapping the last clause before [N]. + */ +export function wrapCitationMarkers(html: string, sourceMatchMap?: CitationSourceMatchMap): string { + const segments = html.split(/(<[^>]+>)/g); + return segments + .map(segment => (segment.startsWith("<") && segment.endsWith(">") ? segment : wrapCitationMarkerTextSegment(segment, sourceMatchMap))) + .join(""); } // ── Block-level parsing ──────────────────────────────────────────── @@ -180,7 +198,7 @@ interface Block { } function parseBlocks(markdown: string): Block[] { - const lines = markdown.split("\n"); + const lines = markdown.replace(/\r\n?/g, "\n").split("\n"); const blocks: Block[] = []; let i = 0; diff --git a/src/client/DeepCitation.ts b/src/client/DeepCitation.ts index 2ed1998f..af78d529 100644 --- a/src/client/DeepCitation.ts +++ b/src/client/DeepCitation.ts @@ -98,12 +98,27 @@ export async function fetchWithRetry( try { return await doFetch(url, options); } catch (err) { + if (err instanceof Error && (err.name === "AbortError" || err.name === "TimeoutError")) { + throw err; + } lastError = err; } } throw lastError; } +function createTimeoutSignal(timeoutMs: number): { signal: AbortSignal; cleanup: () => void } { + const controller = new AbortController(); + const timeout = setTimeout(() => { + controller.abort(new DOMException(`Request timed out after ${timeoutMs}ms`, "TimeoutError")); + }, timeoutMs); + timeout.unref?.(); + return { + signal: controller.signal, + cleanup: () => clearTimeout(timeout), + }; +} + /** * Default concurrency limit for parallel file uploads. * Prevents overwhelming the network/server with too many simultaneous requests. @@ -884,7 +899,7 @@ export class DeepCitation { // Create the fetch promise and cache it const fetchPromise = (async (): Promise => { - const response = await this._fetch(`${this.apiUrl}/verifyCitations`, { + const requestInit: RequestInit = { method: "POST", headers: { ...this.baseHeaders(), "Content-Type": "application/json" }, body: JSON.stringify({ @@ -895,18 +910,26 @@ export class DeepCitation { endUserId: resolvedEndUserId, }, }), - }); - this.checkLatestVersion(response); - this.checkUsageWarning(response); - - if (!response.ok) { - // Remove from cache on error so retry is possible - this.verifyCache.delete(cacheKey); - this.logger.error?.("Verification failed", { attachmentId, status: response.status }); - throw await createApiError(response, "Verification"); - } + }; + const timeoutMs = options?.requestTimeoutMs; + const timeout = Number.isFinite(timeoutMs) && timeoutMs && timeoutMs > 0 ? createTimeoutSignal(timeoutMs) : null; + if (timeout) requestInit.signal = timeout.signal; + try { + const response = await this._fetch(`${this.apiUrl}/verifyCitations`, requestInit); + this.checkLatestVersion(response); + this.checkUsageWarning(response); + + if (!response.ok) { + // Remove from cache on error so retry is possible + this.verifyCache.delete(cacheKey); + this.logger.error?.("Verification failed", { attachmentId, status: response.status }); + throw await createApiError(response, "Verification"); + } - return normalizeVerifyResponse((await response.json()) as VerifyCitationsResponse); + return normalizeVerifyResponse((await response.json()) as VerifyCitationsResponse); + } finally { + timeout?.cleanup(); + } })(); // Force cleanup if cache is at or approaching the limit to prevent memory leaks diff --git a/src/client/types.ts b/src/client/types.ts index 7ca90a24..91eec394 100644 --- a/src/client/types.ts +++ b/src/client/types.ts @@ -279,6 +279,11 @@ export interface VerifyCitationsOptions { outputImageFormat?: ImageFormat; /** Developer's end-user identifier for usage attribution. Overrides the instance-level endUserId if set. */ endUserId?: string; + /** + * Maximum time in milliseconds to wait for the verification request before aborting. + * If omitted, verification uses the client/network default. + */ + requestTimeoutMs?: number; } /** diff --git a/src/parsing/citationParser.ts b/src/parsing/citationParser.ts index f75bbf50..3ca6b759 100644 --- a/src/parsing/citationParser.ts +++ b/src/parsing/citationParser.ts @@ -212,6 +212,58 @@ function parseCitationsFromJson(parsed: unknown): CitationData[] { return rawCitations.map(c => expandCompactKeys(c as Record)); } +function escapeLiteralControlCharactersInJsonStrings(text: string): string { + let out = ""; + let inString = false; + let escaped = false; + + for (let i = 0; i < text.length; i++) { + const ch = text[i] as string; + if (!inString) { + out += ch; + if (ch === '"') inString = true; + continue; + } + + if (escaped) { + out += ch; + escaped = false; + continue; + } + + if (ch === "\\") { + out += ch; + escaped = true; + continue; + } + + if (ch === '"') { + out += ch; + inString = false; + continue; + } + + if (ch === "\n" || ch === "\r") { + out += "\\n"; + continue; + } + + if (ch === "\t") { + out += "\\t"; + continue; + } + + if (ch.charCodeAt(0) < 0x20) { + out += `\\u${ch.charCodeAt(0).toString(16).padStart(4, "0")}`; + continue; + } + + out += ch; + } + + return out; +} + /** * Attempts to repair malformed JSON. * Handles common LLM output issues like: @@ -238,6 +290,14 @@ function repairJson(jsonString: string): { repairs.push("removed markdown code block markers"); } + // Escape literal control characters that LLMs sometimes emit inside JSON + // strings (especially multiline source_context/source_match values). + const beforeControlCharRepair = repaired; + repaired = escapeLiteralControlCharactersInJsonStrings(repaired); + if (repaired !== beforeControlCharRepair) { + repairs.push("escaped literal control characters"); + } + // Fix invalid escape sequences inside JSON strings. // Valid escapes: \" \\ \/ \b \f \n \r \t \uXXXX // Invalid escapes like \~ \x \a etc. should have the backslash removed. From 48836e9ef2d7c708fe2bdb700268934627e5df3c Mon Sep 17 00:00:00 2001 From: Benson Date: Thu, 16 Apr 2026 10:45:38 -0600 Subject: [PATCH 05/13] Tighten markdown anchor promotion --- src/__tests__/cliCommands.test.ts | 52 ++++++++++++++++++++++++++++++- src/cli/commands.ts | 43 +++++++++++++++++++------ src/cli/markdownToHtml.ts | 6 +++- 3 files changed, 90 insertions(+), 11 deletions(-) diff --git a/src/__tests__/cliCommands.test.ts b/src/__tests__/cliCommands.test.ts index 24bb5db2..bb54131e 100644 --- a/src/__tests__/cliCommands.test.ts +++ b/src/__tests__/cliCommands.test.ts @@ -904,7 +904,7 @@ describe("verify command (--citations mode)", () => { expect(mockVerifyAttachment).toHaveBeenCalledTimes(2); for (const call of mockVerifyAttachment.mock.calls) { - expect(call[2]).toMatchObject({ requestTimeoutMs: 5000 }); + expect(call[2]).toMatchObject({ requestTimeoutMs: 10_000 }); } expect(stderr).toContain("3 citations across 2 attachment(s)"); } finally { @@ -912,6 +912,56 @@ describe("verify command (--citations mode)", () => { } }); + it("does not auto-promote a display label when it has no overlap with the anchor", async () => { + const tmpDir = makeTmpDir(); + const origCwd = process.cwd(); + process.chdir(tmpDir); + mkdirSync(join(tmpDir, ".deepcitation"), { recursive: true }); + + const mdPath = join(tmpDir, "report.md"); + const outPath = join(tmpDir, "verified.html"); + writeFileSync( + mdPath, + [ + "# Report", + "", + "[**Horizontal Boundaries**](cite:1)", + "", + "<<>>", + JSON.stringify([ + { + id: 1, + attachment_id: "att-1", + source_context: "context", + source_match: "SCHEDULE “C”", + line_ids: [1], + page_id: "1_1", + }, + ]), + "<<>>", + "", + ].join("\n"), + ); + + mockVerifyAttachment.mockResolvedValueOnce({ + verifications: { key1: { status: "not_found" } }, + }); + + try { + const { stderr } = await captureOutput(() => verify(["--markdown", mdPath, "--out", outPath], fmtNetErr)); + expect(mockVerifyAttachment).toHaveBeenCalledTimes(1); + const citationsArg = mockVerifyAttachment.mock.calls[0]?.[1] as Record>; + const firstCitation = Object.values(citationsArg)[0]; + expect(firstCitation).toBeDefined(); + expect(firstCitation).toMatchObject({ + sourceMatch: "SCHEDULE “C”", + }); + expect(stderr).not.toContain("auto-promoted display label to anchor"); + } finally { + process.chdir(origCwd); + } + }); + it("writes attachments to output file when verifyAttachment returns assets", async () => { const tmpDir = makeTmpDir(); const origCwd = process.cwd(); diff --git a/src/cli/commands.ts b/src/cli/commands.ts index 4dbb6fe7..cf3a6488 100644 --- a/src/cli/commands.ts +++ b/src/cli/commands.ts @@ -241,7 +241,21 @@ Examples: const ALLOWED_THEMES = ["auto", "light", "dark"] as const; const ALLOWED_INDICATORS = ["icon", "dot", "none"] as const; -const VERIFY_REQUEST_TIMEOUT_MS = 5000; +const VERIFY_REQUEST_TIMEOUT_MS = 10_000; + +function hasMeaningfulLabelOverlap(left: string, right: string): boolean { + const tokenRe = /[a-z0-9]+/g; + const leftTokens = new Set((left.toLowerCase().match(tokenRe) ?? []).filter(token => token.length >= 3)); + if (leftTokens.size === 0) return false; + + for (const token of right.toLowerCase().match(tokenRe) ?? []) { + if (token.length >= 3 && leftTokens.has(token)) { + return true; + } + } + + return false; +} // ── helpers ─────────────────────────────────────────────────────── @@ -1117,14 +1131,14 @@ export async function verifyHtml(argv: string[], _fmtNetErr: (err: unknown) => s die(`No valid <<>> block found in the ${src} file.`, VERIFY_HELP); } - // 1b. When the model picked a short bold display label that differs from - // source_match, promote the bold text to anchor — it's what the reader - // clicks and should drive the highlight. Mutates `parsed.citations` - // before the verify API call. + // 1b. When the model picked a short bold display label that still overlaps + // the existing source_match, promote the bold text to anchor — it's what + // the reader clicks and should drive the highlight. Mutates + // `parsed.citations` before the verify API call. { + const labelsById = new Map>(); const spanRe = /<([a-zA-Z][a-zA-Z0-9]*)\s+[^>]*data-cite="(\d+)"[^>]*>([\s\S]*?)<\/\1>/g; let m: RegExpExecArray | null; - let promoted = 0; while ((m = safeExec(spanRe, parsed.visibleText)) !== null) { const id = parseInt(m[2], 10); // Strip nested tags in one pass. data-cite spans wrap at most a single @@ -1140,13 +1154,24 @@ export async function verifyHtml(argv: string[], _fmtNetErr: (err: unknown) => s visible = visible.replace(/\s+/g, " ").trim(); if (!visible) continue; - const wordCount = visible.split(/\s+/).length; - if (wordCount > 4 || visible.length > 40) continue; + let labels = labelsById.get(id); + if (!labels) { + labels = new Set(); + labelsById.set(id, labels); + } + labels.add(visible); + } + let promoted = 0; + for (const [id, labels] of labelsById.entries()) { + if (labels.size !== 1) continue; + const [visible] = [...labels]; const cd = parsed.citations.find(c => c.id === id); if (!cd) continue; + const currentAnchor = (cd.source_match ?? "").trim(); - if (currentAnchor && currentAnchor.toLowerCase() === visible.toLowerCase()) continue; + if (!currentAnchor || currentAnchor.toLowerCase() === visible.toLowerCase()) continue; + if (!hasMeaningfulLabelOverlap(currentAnchor, visible)) continue; console.error( ` [${id}] auto-promoted display label to anchor: "${visible}" (was "${currentAnchor.slice(0, 40)}${currentAnchor.length > 40 ? "…" : ""}")`, diff --git a/src/cli/markdownToHtml.ts b/src/cli/markdownToHtml.ts index 731e7fe4..9cf3664a 100644 --- a/src/cli/markdownToHtml.ts +++ b/src/cli/markdownToHtml.ts @@ -183,7 +183,11 @@ function wrapCitationMarkerTextSegment(text: string, sourceMatchMap?: CitationSo export function wrapCitationMarkers(html: string, sourceMatchMap?: CitationSourceMatchMap): string { const segments = html.split(/(<[^>]+>)/g); return segments - .map(segment => (segment.startsWith("<") && segment.endsWith(">") ? segment : wrapCitationMarkerTextSegment(segment, sourceMatchMap))) + .map(segment => { + return segment.startsWith("<") && segment.endsWith(">") + ? segment + : wrapCitationMarkerTextSegment(segment, sourceMatchMap); + }) .join(""); } From dbb841b15b2bd7e6798d122587f2dbcf18c64769 Mon Sep 17 00:00:00 2001 From: Benson Date: Thu, 16 Apr 2026 11:00:47 -0600 Subject: [PATCH 06/13] Improve markdownToHtml report shell --- CHANGELOG.md | 2 +- src/__tests__/markdownToHtml.test.ts | 20 +++++-- src/cli/markdownToHtml.ts | 80 +++++++--------------------- 3 files changed, 36 insertions(+), 66 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8709c5c0..50810fd6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,7 +21,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed -- **`verify` auto-publishes by default** — successful runs upload to My Verifications as `private`; opt-out with `--no-publish`; pass `--vis unlisted` or `--vis public` to widen visibility. (#425) +- **`verify` auto-publishes by default** — successful runs upload to My Verifications as `private`; opt-out with `--local-only`; pass `--vis unlisted` or `--vis public` to widen visibility. (#425) - **Per-attachment assets in `verify` output** — `verify-response.json` now includes an `attachments` map (pageImages, originalDownload) alongside `verifications` when any attachment returned assets. (#430) - **CDN blink animation synced with React** — CDN now imports `BLINK_*` constants from the same source as the React component, fixing a 180 ms vs 120 ms enter-duration discrepancy; implements the full three-stage enter animation with a `cancelBlink()` guard. (#427) - **Billing copy** — quota-exceeded messages and the dashboard description now reflect subscription tiers (Standard 20/week at $20/mo, Pro unlimited at $200/mo). (#427) diff --git a/src/__tests__/markdownToHtml.test.ts b/src/__tests__/markdownToHtml.test.ts index 81d45a5d..cc439be1 100644 --- a/src/__tests__/markdownToHtml.test.ts +++ b/src/__tests__/markdownToHtml.test.ts @@ -115,6 +115,14 @@ describe("markdownToHtml block parsing", () => { expect(result).toContain("

"); }); + it("renders headings with up to three leading spaces", () => { + const result = markdownToHtml(" # Title\n\n ## Section\n\n ### Sub", { style: "plain" }); + expect(result).toContain("

"); + expect(result).toContain("

"); + expect(result).toContain("

"); + expect(result).not.toContain("

### Sub

"); + }); + it("renders paragraphs", () => { const result = markdownToHtml("Hello world.\n\nSecond paragraph.", { style: "plain" }); expect(result).toContain("

Hello world.

"); @@ -200,7 +208,6 @@ describe("markdownToHtml style shells", () => { const mdWithSections = "# Report\n\n## Key Findings\n\nImportant stuff.\n\n## Details\n\nMore details."; const result = markdownToHtml(mdWithSections, { style: "report" }); expect(result).toContain(""); - expect(result).toContain("dc-verdict"); expect(result).toContain("data-dc-drawer-trigger"); }); @@ -330,6 +337,14 @@ describe("markdownToHtml report body (flat rendering)", () => { const sectionPos = result.indexOf("Content."); expect(preamblePos).toBeLessThan(sectionPos); }); + + it("does not render left-gutter section numbers in the report shell", () => { + const result = markdownToHtml("# Report\n\n## One\n\n### Two", { style: "report" }); + expect(result).not.toContain('content: "00"'); + expect(result).not.toContain("counter-reset: h2section"); + expect(result).not.toContain("h2::before"); + expect(result).not.toContain("h3::before"); + }); }); // ── markdownToHtml — citation marker integration ────────────────── @@ -492,20 +507,17 @@ describe("markdownToHtml header — claim & model", () => { it("renders a claim card when claim is provided", () => { const result = markdownToHtml("# T\nbody", { claim: "Did revenue exceed $4B?" }); expect(result).toContain('class="dc-claim"'); - expect(result).toContain(">CLAIM<"); expect(result).toContain("Did revenue exceed $4B?"); }); it("omits the claim card when claim is absent", () => { const result = markdownToHtml("# T\nbody", {}); expect(result).not.toContain('
CLAIM<"); }); it("suppresses a whitespace-only claim", () => { const result = markdownToHtml("# T\nbody", { claim: " " }); expect(result).not.toContain('
CLAIM<"); }); it("escapes HTML in the claim", () => { diff --git a/src/cli/markdownToHtml.ts b/src/cli/markdownToHtml.ts index 9cf3664a..ec575140 100644 --- a/src/cli/markdownToHtml.ts +++ b/src/cli/markdownToHtml.ts @@ -201,6 +201,18 @@ interface Block { language?: string; // for code blocks } +function parseAtxHeading(line: string): { level: number; content: string } | null { + const match = line.match(/^ {0,3}(#{1,6})\s+(.+)$/); + if (!match) { + return null; + } + + return { + level: match[1].length, + content: match[2].trimEnd(), + }; +} + function parseBlocks(markdown: string): Block[] { const lines = markdown.replace(/\r\n?/g, "\n").split("\n"); const blocks: Block[] = []; @@ -234,12 +246,12 @@ function parseBlocks(markdown: string): Block[] { } // Heading - const headingMatch = line.match(/^(#{1,6})\s+(.+)$/); - if (headingMatch) { + const heading = parseAtxHeading(line); + if (heading) { blocks.push({ type: "heading", - level: headingMatch[1].length, - content: headingMatch[2], + level: heading.level, + content: heading.content, }); i++; continue; @@ -297,7 +309,7 @@ function parseBlocks(markdown: string): Block[] { while ( i < lines.length && lines[i].trim() !== "" && - !/^#{1,6}\s/.test(lines[i]) && + !parseAtxHeading(lines[i]) && !/^(-{3,}|\*{3,}|_{3,})\s*$/.test(lines[i]) && !lines[i].trim().startsWith("```") && !/^\s*[-*+]\s+/.test(lines[i]) && @@ -552,7 +564,6 @@ function reportShell(title: string, bodyHtml: string, options: MarkdownToHtmlOpt const claimText = options.claim?.trim(); const claimCard = claimText ? `
-CLAIM
${inlineFormat(claimText)}
` : ""; @@ -576,21 +587,9 @@ ${REVIEW_SHARED_BASE_CSS} -webkit-font-smoothing: antialiased; max-width: 900px; margin: 0 auto; - padding: 3rem 1.5rem 4rem 6.5rem; - counter-reset: h2section; - } - body > header { margin-bottom: 2rem; position: relative; } - body > header::before { - content: "00"; - position: absolute; - left: -5rem; - top: 0.4rem; - font-family: var(--dc-font-family-mono); - font-size: 12px; - font-weight: 500; - color: var(--dc-border); - letter-spacing: 0.05em; + padding: 3rem 1.5rem 4rem; } + body > header { margin-bottom: 2rem; } body > header h1 { font-size: 30px; font-weight: 600; @@ -615,58 +614,19 @@ ${REVIEW_SHARED_BASE_CSS} .dc-meta-link { color: var(--dc-primary); text-decoration: none; font-weight: 500; } .dc-meta-link:hover { text-decoration: underline; } [data-cite] strong { font-weight: 600; } - .dc-verdict { - display: flex; - gap: 1.5rem; - padding: 0.85rem 1rem; - margin-bottom: 2.25rem; - font-family: var(--dc-font-family-mono); - font-size: 12px; - border: 1px solid var(--dc-border); - background: var(--dc-muted); - } - .dc-verdict .v-found { color: var(--dc-verified); } - .dc-verdict .v-partial { color: var(--dc-partial); } - .dc-verdict .v-miss { color: var(--dc-destructive); } h1 { font-size: 30px; font-weight: 600; letter-spacing: -0.02em; } h2 { - counter-increment: h2section; - counter-reset: h3section; font-size: 20px; font-weight: 600; margin: 2.75rem 0 0.85rem; padding-bottom: 0.5rem; border-bottom: 1px solid var(--dc-border); letter-spacing: -0.01em; - position: relative; - } - h2::before { - content: counter(h2section, decimal-leading-zero); - position: absolute; - left: -5rem; - top: 0.35rem; - font-family: var(--dc-font-family-mono); - font-size: 12px; - font-weight: 500; - color: var(--dc-primary); - letter-spacing: 0.05em; } h3 { - counter-increment: h3section; font-size: 16px; font-weight: 600; margin: 1.75rem 0 0.5rem; - position: relative; - } - h3::before { - content: counter(h2section, decimal-leading-zero) "." counter(h3section); - position: absolute; - left: -5rem; - top: 0.2rem; - font-family: var(--dc-font-family-mono); - font-size: 11px; - font-weight: 500; - color: var(--dc-subtle-foreground); } .dc-section { background: var(--dc-background); border: 1px solid var(--dc-border); padding: 1.25rem 1.5rem; margin: 1rem 0; } .mono { font-family: var(--dc-font-family-mono); font-size: 14px; font-weight: 500; } @@ -729,7 +689,6 @@ ${REVIEW_SHARED_BASE_CSS} .dc-claim-text em { font-style: italic; } @media (max-width: 720px) { body { padding: 2rem 1.25rem 3rem; } - body > header::before, h2::before, h3::before { position: static; display: block; margin-bottom: 0.2rem; } .dc-footer { margin-left: 0; padding-left: 0; } } @media print { @@ -751,7 +710,6 @@ ${
` : "" } -
${bodyHtml}