From c7f2478c8adbee4cb7a14dd706a683698e4298bc Mon Sep 17 00:00:00 2001
From: Caio Pizzol
Date: Mon, 20 Apr 2026 12:55:56 -0300
Subject: [PATCH] feat(corpus): seed worktree pulls from the primary repo via
 hardlink

Pulling the full corpus (~450 .docx files, ~100 MB) on every new worktree
is wasteful: the bytes are identical to the primary repo already sitting
on disk.

Detect when we're in a git worktree via `git rev-parse --git-common-dir`,
then hardlink any files the primary has into the worktree's corpus dir
before hitting R2. Only files the primary is missing go over the network.

--force still re-downloads from R2 (we unlink the destination before
writing so the primary's inodes never get clobbered). --no-seed opts out
for users who want the old behaviour.
---
 scripts/corpus/README.md  | 14 +++++++
 scripts/corpus/pull.mjs   | 34 +++++++++++++++--
 scripts/corpus/shared.mjs | 79 ++++++++++++++++++++++++++++++++++++++-
 3 files changed, 123 insertions(+), 4 deletions(-)

diff --git a/scripts/corpus/README.md b/scripts/corpus/README.md
index bcf560fa20..0ff102f4b5 100644
--- a/scripts/corpus/README.md
+++ b/scripts/corpus/README.md
@@ -24,6 +24,20 @@ pnpm corpus:update-registry
 `pnpm corpus:pull` now tolerates missing keys and prunes stale `registry.json` entries automatically.
 `pnpm corpus:pull` does not remove local files that no longer exist in R2; use
 `pnpm corpus:delete` when you want the shared corpus and local copy removed together.
+
+### Worktrees: seeding from the primary repo
+
+When `pnpm corpus:pull` runs inside a git worktree it first hardlinks anything
+the primary repo already has on disk, then only goes to R2 for the rest. The
+primary's bytes are already local so this is effectively instant for a fresh
+worktree.
+
+- Hardlinks (not copies) — zero disk overhead, both worktrees see the same
+  inode. Falls back to copy automatically when the two checkouts live on
+  different filesystems.
+- `--force` still re-downloads from R2. Destinations are unlinked before each
+  R2 write, so the primary's files are never clobbered.
+- Pass `--no-seed` to skip the hardlink step and go straight to R2.
 
 `pnpm corpus:push` runs `superdoc-benchmark baseline --force` by default after upload.
 Set `SUPERDOC_CORPUS_SKIP_WORD_BASELINE=1` (or pass `--no-word-baseline`) to disable this behavior.
diff --git a/scripts/corpus/pull.mjs b/scripts/corpus/pull.mjs
index 4c6998dfd1..7acc067969 100644
--- a/scripts/corpus/pull.mjs
+++ b/scripts/corpus/pull.mjs
@@ -17,6 +17,7 @@ import {
   normalizePath,
   printCorpusEnvHint,
   saveRegistry,
+  seedCorpusFromPrimary,
   sortRegistryDocs,
   writeProgressBar,
 } from './shared.mjs';
@@ -48,6 +49,8 @@ Options:
   --match               Substring filter (repeatable)
   --exclude             Exclude filter (repeatable)
   --force               Re-download files even if they already exist
+  --no-seed             In a git worktree, skip hardlinking from the primary
+                        repo's corpus before pulling from R2
   --link-visual         Point tests/visual/test-data at --dest via symlink
   --dry-run             Print actions without downloading
   --quiet               Suppress verbose logs; show only progress and summary
@@ -65,6 +68,7 @@ function parseArgs(argv) {
     linkVisual: false,
     dryRun: false,
     quiet: false,
+    seedFromPrimary: true,
   };
 
   for (let i = 0; i < argv.length; i += 1) {
@@ -99,6 +103,10 @@ function parseArgs(argv) {
       args.force = true;
       continue;
     }
+    if (arg === '--no-seed') {
+      args.seedFromPrimary = false;
+      continue;
+    }
     if (arg === '--link-visual') {
       args.linkVisual = true;
       continue;
     }
@@ -247,6 +255,7 @@ async function main() {
 
   let downloaded = 0;
   let skipped = 0;
+  let seeded = 0;
 
   if (!args.quiet) {
     console.log(`[corpus] Source: ${corpus.source}`);
@@ -254,6 +263,13 @@ async function main() {
     console.log(`[corpus] Corpus size: ${selectedDocs.length} documents`);
   }
 
+  // Fast path for git worktrees: hardlink from the primary repo's corpus
+  // before reaching for R2. Downloads below still run for anything the
+  // primary is missing, so --force or a fresh fixture still trigger R2.
+  if (args.seedFromPrimary && !args.force && !args.dryRun) {
+    seeded = seedCorpusFromPrimary(destinationRoot, selectedDocs, { quiet: args.quiet });
+  }
+
   if (corpus.source === REGISTRY_KEY && corpus.registry) {
     const allObjectKeys = await client.listObjects('');
     const objectKeySet = new Set(allObjectKeys.map((key) => normalizePath(key).toLowerCase()));
@@ -316,6 +332,14 @@ async function main() {
       const { relativePath, objectKey, destinationPath } = toDownload[idx];
 
       try {
+        // Unlink before writing. If the destination is a hardlink to the
+        // primary repo's corpus (seeded above), wrangler's r2 object get
+        // would otherwise write through and mutate the primary's file.
+        try {
+          fs.rmSync(destinationPath, { force: true });
+        } catch {
+          // swallow — the write below will surface any real permission issue
+        }
         await client.getObjectToFile(objectKey, destinationPath);
         downloaded += 1;
       } catch (error) {
@@ -379,12 +403,16 @@ async function main() {
     const elapsed = Date.now() - startedAt;
 
     if (args.quiet) {
-      if (downloaded > 0) {
-        console.log(`[corpus] Synced ${downloaded} new document(s) in ${formatDurationMs(elapsed)}`);
+      if (downloaded > 0 || seeded > 0) {
+        const parts = [];
+        if (seeded > 0) parts.push(`${seeded} seeded`);
+        if (downloaded > 0) parts.push(`${downloaded} downloaded`);
+        console.log(`[corpus] Synced ${parts.join(' + ')} in ${formatDurationMs(elapsed)}`);
       }
     } else {
+      const seedPart = seeded > 0 ? `, Seeded: ${seeded}` : '';
       console.log(
-        `[corpus] Done. Downloaded: ${downloaded}, Skipped: ${skipped}, Missing: ${missingRegistryPaths.length}, Elapsed: ${formatDurationMs(elapsed)}`,
+        `[corpus] Done. Downloaded: ${downloaded}${seedPart}, Skipped: ${skipped}, Missing: ${missingRegistryPaths.length}, Elapsed: ${formatDurationMs(elapsed)}`,
       );
     }
   } finally {
diff --git a/scripts/corpus/shared.mjs b/scripts/corpus/shared.mjs
index 9fb38dacdc..e104ecf442 100644
--- a/scripts/corpus/shared.mjs
+++ b/scripts/corpus/shared.mjs
@@ -3,7 +3,7 @@ import os from 'node:os';
 import path from 'node:path';
 import crypto from 'node:crypto';
 import process from 'node:process';
-import { execFile as execFileCb } from 'node:child_process';
+import { execFile as execFileCb, execFileSync } from 'node:child_process';
 import { fileURLToPath, pathToFileURL } from 'node:url';
 import { promisify } from 'node:util';
 import { createRequire } from 'node:module';
@@ -468,6 +468,83 @@ export function ensureVisualTestDataSymlink(corpusRoot) {
   return { linked: true, changed: true, backupPath: null };
 }
 
+/**
+ * If the current repo is a git worktree, return the primary repo's root path.
+ * Uses `git rev-parse --git-common-dir`: in a worktree this resolves to
+ * `<primary>/.git/worktrees/<name>/..` pointing inside the primary's .git dir,
+ * while in a non-worktree checkout it resolves to the local `.git`.
+ * Returns null if we're not in a worktree, not in git, or git isn't available.
+ */
+export function findPrimaryRepoRoot() {
+  try {
+    const commonDir = execFileSync('git', ['rev-parse', '--git-common-dir'], {
+      cwd: REPO_ROOT,
+      stdio: ['ignore', 'pipe', 'ignore'],
+      encoding: 'utf8',
+    }).trim();
+    const absoluteCommonDir = path.resolve(REPO_ROOT, commonDir);
+    const ownGitDir = path.resolve(REPO_ROOT, '.git');
+
+    // Non-worktree: common dir IS our own .git
+    if (absoluteCommonDir === ownGitDir) return null;
+
+    // Worktree: common dir is the primary's .git (or a file pointer); its parent is the primary repo
+    const primaryRoot = path.dirname(absoluteCommonDir);
+    if (primaryRoot === REPO_ROOT) return null;
+    return primaryRoot;
+  } catch {
+    return null;
+  }
+}
+
+/**
+ * Seed the worktree's corpus from the primary repo using hardlinks (falling
+ * back to copy across filesystems). Skips files that already exist at the
+ * destination and files the primary doesn't have. Returns the count seeded.
+ *
+ * Only runs when we're in a worktree AND the primary has corpus files.
+ */
+export function seedCorpusFromPrimary(destinationRoot, selectedDocs, { quiet = false } = {}) {
+  const primaryRoot = findPrimaryRepoRoot();
+  if (!primaryRoot) return 0;
+
+  // Follow the primary's default corpus layout. This is intentionally a fixed
+  // path — users with custom --dest in the primary can still fall back to R2.
+  const primaryCorpus = path.join(primaryRoot, path.basename(DEFAULT_CORPUS_ROOT));
+  if (!fs.existsSync(primaryCorpus)) return 0;
+
+  let seeded = 0;
+  let copyFallback = 0;
+
+  for (const doc of selectedDocs) {
+    const relativePath = normalizePath(doc.relative_path);
+    if (!relativePath) continue;
+
+    const primaryPath = path.join(primaryCorpus, relativePath);
+    const destinationPath = path.join(destinationRoot, relativePath);
+
+    if (fs.existsSync(destinationPath)) continue;
+    if (!fs.existsSync(primaryPath)) continue;
+
+    fs.mkdirSync(path.dirname(destinationPath), { recursive: true });
+    try {
+      fs.linkSync(primaryPath, destinationPath);
+    } catch {
+      // Hardlink fails across filesystems or for special files — copy instead
+      fs.copyFileSync(primaryPath, destinationPath);
+      copyFallback += 1;
+    }
+    seeded += 1;
+  }
+
+  if (seeded > 0 && !quiet) {
+    const relPrimary = path.relative(REPO_ROOT, primaryRoot) || primaryRoot;
+    const method = copyFallback === seeded ? 'copied' : copyFallback > 0 ? 'hardlinked (with copy fallback)' : 'hardlinked';
+    console.log(`[corpus] Seeded ${seeded} file(s) ${method} from primary repo: ${relPrimary}`);
+  }
+  return seeded;
+}
+
 export function applyPathFilters(paths, { filters = [], matches = [], excludes = [] } = {}) {
   const normalizedFilters = filters.map((value) => String(value).toLowerCase()).filter(Boolean);
   const normalizedMatches = matches.map((value) => String(value).toLowerCase()).filter(Boolean);