From c7f2478c8adbee4cb7a14dd706a683698e4298bc Mon Sep 17 00:00:00 2001
From: Caio Pizzol
Date: Mon, 20 Apr 2026 12:55:56 -0300
Subject: [PATCH] feat(corpus): seed worktree pulls from the primary repo via
 hardlink

Pulling the full corpus (~450 .docx files, ~100 MB) on every new worktree
is wasteful: the bytes are identical to the primary repo already sitting
on disk.

Detect when we're in a git worktree via `git rev-parse --git-common-dir`,
then hardlink any files the primary has into the worktree's corpus dir
before hitting R2. Only files the primary is missing go over the network.

--force still re-downloads from R2 (we unlink the destination before
writing so the primary's inodes never get clobbered). --no-seed opts out
for users who want the old behaviour.
---
 scripts/corpus/README.md  | 14 +++++++
 scripts/corpus/pull.mjs   | 34 +++++++++++++++--
 scripts/corpus/shared.mjs | 79 ++++++++++++++++++++++++++++++++++++++-
 3 files changed, 123 insertions(+), 4 deletions(-)

diff --git a/scripts/corpus/README.md b/scripts/corpus/README.md
index bcf560fa20..0ff102f4b5 100644
--- a/scripts/corpus/README.md
+++ b/scripts/corpus/README.md
@@ -24,6 +24,20 @@ pnpm corpus:update-registry
 `pnpm corpus:pull` now tolerates missing keys and prunes stale `registry.json` entries automatically.
 `pnpm corpus:pull` does not remove local files that no longer exist in R2; use
 `pnpm corpus:delete` when you want the shared corpus and local copy removed together.
+
+### Worktrees: seeding from the primary repo
+
+When `pnpm corpus:pull` runs inside a git worktree it first hardlinks anything
+the primary repo already has on disk, then only goes to R2 for the rest. The
+primary's bytes are already local so this is effectively instant for a fresh
+worktree.
+
+- Hardlinks (not copies) — zero disk overhead, both worktrees see the same
+  inode. Falls back to copy automatically when the two checkouts live on
+  different filesystems.
+- `--force` still re-downloads from R2. Destinations are unlinked before each
+  R2 write, so the primary's files are never clobbered.
+- Pass `--no-seed` to skip the hardlink step and go straight to R2.
 
 `pnpm corpus:push` runs `superdoc-benchmark baseline --force` by default after upload.
 Set `SUPERDOC_CORPUS_SKIP_WORD_BASELINE=1` (or pass `--no-word-baseline`) to disable this behavior.
diff --git a/scripts/corpus/pull.mjs b/scripts/corpus/pull.mjs
index 4c6998dfd1..7acc067969 100644
--- a/scripts/corpus/pull.mjs
+++ b/scripts/corpus/pull.mjs
@@ -17,6 +17,7 @@ import {
   normalizePath,
   printCorpusEnvHint,
   saveRegistry,
+  seedCorpusFromPrimary,
   sortRegistryDocs,
   writeProgressBar,
 } from './shared.mjs';
@@ -48,6 +49,8 @@ Options:
   --match               Substring filter (repeatable)
   --exclude             Exclude filter (repeatable)
   --force               Re-download files even if they already exist
+  --no-seed             In a git worktree, skip hardlinking from the primary
+                        repo's corpus before pulling from R2
   --link-visual         Point tests/visual/test-data at --dest via symlink
   --dry-run             Print actions without downloading
   --quiet               Suppress verbose logs; show only progress and summary
@@ -65,6 +68,7 @@ function parseArgs(argv) {
     linkVisual: false,
     dryRun: false,
     quiet: false,
+    seedFromPrimary: true,
   };
 
   for (let i = 0; i < argv.length; i += 1) {
@@ -99,6 +103,10 @@ function parseArgs(argv) {
       args.force = true;
       continue;
     }
+    if (arg === '--no-seed') {
+      args.seedFromPrimary = false;
+      continue;
+    }
     if (arg === '--link-visual') {
       args.linkVisual = true;
       continue;
     }
@@ -247,6 +255,7 @@ async function main() {
 
   let downloaded = 0;
   let skipped = 0;
+  let seeded = 0;
 
   if (!args.quiet) {
     console.log(`[corpus] Source: ${corpus.source}`);
@@ -254,6 +263,13 @@ async function main() {
     console.log(`[corpus] Corpus size: ${selectedDocs.length} documents`);
   }
 
+  // Fast path for git worktrees: hardlink from the primary repo's corpus
+  // before reaching for R2. Downloads below still run for anything the
+  // primary is missing, so --force or a fresh fixture still trigger R2.
+  if (args.seedFromPrimary && !args.force && !args.dryRun) {
+    seeded = seedCorpusFromPrimary(destinationRoot, selectedDocs, { quiet: args.quiet });
+  }
+
   if (corpus.source === REGISTRY_KEY && corpus.registry) {
     const allObjectKeys = await client.listObjects('');
     const objectKeySet = new Set(allObjectKeys.map((key) => normalizePath(key).toLowerCase()));
@@ -316,6 +332,14 @@ async function main() {
       const { relativePath, objectKey, destinationPath } = toDownload[idx];
 
       try {
+        // Unlink before writing. If the destination is a hardlink to the
+        // primary repo's corpus (seeded above), wrangler's r2 object get
+        // would otherwise write through and mutate the primary's file.
+        try {
+          fs.rmSync(destinationPath, { force: true });
+        } catch {
+          // swallow — the write below will surface any real permission issue
+        }
         await client.getObjectToFile(objectKey, destinationPath);
         downloaded += 1;
       } catch (error) {
@@ -379,12 +403,16 @@ async function main() {
     const elapsed = Date.now() - startedAt;
 
     if (args.quiet) {
-      if (downloaded > 0) {
-        console.log(`[corpus] Synced ${downloaded} new document(s) in ${formatDurationMs(elapsed)}`);
+      if (downloaded > 0 || seeded > 0) {
+        const parts = [];
+        if (seeded > 0) parts.push(`${seeded} seeded`);
+        if (downloaded > 0) parts.push(`${downloaded} downloaded`);
+        console.log(`[corpus] Synced ${parts.join(' + ')} in ${formatDurationMs(elapsed)}`);
       }
     } else {
+      const seedPart = seeded > 0 ? `, Seeded: ${seeded}` : '';
       console.log(
-        `[corpus] Done. Downloaded: ${downloaded}, Skipped: ${skipped}, Missing: ${missingRegistryPaths.length}, Elapsed: ${formatDurationMs(elapsed)}`,
+        `[corpus] Done. Downloaded: ${downloaded}${seedPart}, Skipped: ${skipped}, Missing: ${missingRegistryPaths.length}, Elapsed: ${formatDurationMs(elapsed)}`,
       );
     }
   } finally {
diff --git a/scripts/corpus/shared.mjs b/scripts/corpus/shared.mjs
index 9fb38dacdc..e104ecf442 100644
--- a/scripts/corpus/shared.mjs
+++ b/scripts/corpus/shared.mjs
@@ -3,7 +3,7 @@ import os from 'node:os';
 import path from 'node:path';
 import crypto from 'node:crypto';
 import process from 'node:process';
-import { execFile as execFileCb } from 'node:child_process';
+import { execFile as execFileCb, execFileSync } from 'node:child_process';
 import { fileURLToPath, pathToFileURL } from 'node:url';
 import { promisify } from 'node:util';
 import { createRequire } from 'node:module';
@@ -468,6 +468,83 @@ export function ensureVisualTestDataSymlink(corpusRoot) {
   return { linked: true, changed: true, backupPath: null };
 }
 
+/**
+ * If the current repo is a git worktree, return the primary repo's root path.
+ * Uses `git rev-parse --git-common-dir`: in a worktree this resolves to
+ * `<primary>/.git/worktrees/<name>/..` pointing inside the primary's .git dir,
+ * while in a non-worktree checkout it resolves to the local `.git`.
+ * Returns null if we're not in a worktree, not in git, or git isn't available.
+ */
+export function findPrimaryRepoRoot() {
+  try {
+    const commonDir = execFileSync('git', ['rev-parse', '--git-common-dir'], {
+      cwd: REPO_ROOT,
+      stdio: ['ignore', 'pipe', 'ignore'],
+      encoding: 'utf8',
+    }).trim();
+    const absoluteCommonDir = path.resolve(REPO_ROOT, commonDir);
+    const ownGitDir = path.resolve(REPO_ROOT, '.git');
+
+    // Non-worktree: common dir IS our own .git
+    if (absoluteCommonDir === ownGitDir) return null;
+
+    // Worktree: common dir is the primary's .git (or a file pointer); its parent is the primary repo
+    const primaryRoot = path.dirname(absoluteCommonDir);
+    if (primaryRoot === REPO_ROOT) return null;
+    return primaryRoot;
+  } catch {
+    return null;
+  }
+}
+
+/**
+ * Seed the worktree's corpus from the primary repo using hardlinks (falling
+ * back to copy across filesystems). Skips files that already exist at the
+ * destination and files the primary doesn't have. Returns the count seeded.
+ *
+ * Only runs when we're in a worktree AND the primary has corpus files.
+ */
+export function seedCorpusFromPrimary(destinationRoot, selectedDocs, { quiet = false } = {}) {
+  const primaryRoot = findPrimaryRepoRoot();
+  if (!primaryRoot) return 0;
+
+  // Follow the primary's default corpus layout. This is intentionally a fixed
+  // path — users with custom --dest in the primary can still fall back to R2.
+  const primaryCorpus = path.join(primaryRoot, path.basename(DEFAULT_CORPUS_ROOT));
+  if (!fs.existsSync(primaryCorpus)) return 0;
+
+  let seeded = 0;
+  let copyFallback = 0;
+
+  for (const doc of selectedDocs) {
+    const relativePath = normalizePath(doc.relative_path);
+    if (!relativePath) continue;
+
+    const primaryPath = path.join(primaryCorpus, relativePath);
+    const destinationPath = path.join(destinationRoot, relativePath);
+
+    if (fs.existsSync(destinationPath)) continue;
+    if (!fs.existsSync(primaryPath)) continue;
+
+    fs.mkdirSync(path.dirname(destinationPath), { recursive: true });
+    try {
+      fs.linkSync(primaryPath, destinationPath);
+    } catch {
+      // Hardlink fails across filesystems or for special files — copy instead
+      fs.copyFileSync(primaryPath, destinationPath);
+      copyFallback += 1;
+    }
+    seeded += 1;
+  }
+
+  if (seeded > 0 && !quiet) {
+    const relPrimary = path.relative(REPO_ROOT, primaryRoot) || primaryRoot;
+    const method = copyFallback === seeded ? 'copied' : copyFallback > 0 ? 'hardlinked (with copy fallback)' : 'hardlinked';
+    console.log(`[corpus] Seeded ${seeded} file(s) ${method} from primary repo: ${relPrimary}`);
+  }
+  return seeded;
+}
+
 export function applyPathFilters(paths, { filters = [], matches = [], excludes = [] } = {}) {
   const normalizedFilters = filters.map((value) => String(value).toLowerCase()).filter(Boolean);
   const normalizedMatches = matches.map((value) => String(value).toLowerCase()).filter(Boolean);