Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions scripts/corpus/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,20 @@ pnpm corpus:update-registry

`pnpm corpus:pull` now tolerates missing keys and prunes stale `registry.json` entries automatically.
`pnpm corpus:pull` does not remove local files that no longer exist in R2; use `pnpm corpus:delete` when you want the shared corpus and local copy removed together.

### Worktrees: seeding from the primary repo

When `pnpm corpus:pull` runs inside a git worktree it first hardlinks anything
the primary repo already has on disk, then only goes to R2 for the rest. The
primary's bytes are already local so this is effectively instant for a fresh
worktree.

- Hardlinks (not copies) — zero disk overhead, both worktrees see the same
inode. Falls back to copy automatically when the two checkouts live on
different filesystems.
- `--force` still re-downloads from R2. Destinations are unlinked before each
R2 write, so the primary's files are never clobbered.
- Pass `--no-seed` to skip the hardlink step and go straight to R2.

`pnpm corpus:push` runs `superdoc-benchmark baseline <uploaded-key> --force` by default after upload.
Set `SUPERDOC_CORPUS_SKIP_WORD_BASELINE=1` (or pass `--no-word-baseline`) to disable this behavior.

Expand Down
34 changes: 31 additions & 3 deletions scripts/corpus/pull.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import {
normalizePath,
printCorpusEnvHint,
saveRegistry,
seedCorpusFromPrimary,
sortRegistryDocs,
writeProgressBar,
} from './shared.mjs';
Expand Down Expand Up @@ -48,6 +49,8 @@ Options:
--match <text> Substring filter (repeatable)
--exclude <prefix> Exclude filter (repeatable)
--force Re-download files even if they already exist
--no-seed In a git worktree, skip hardlinking from the primary
repo's corpus before pulling from R2
--link-visual Point tests/visual/test-data at --dest via symlink
--dry-run Print actions without downloading
--quiet Suppress verbose logs; show only progress and summary
Expand All @@ -65,6 +68,7 @@ function parseArgs(argv) {
linkVisual: false,
dryRun: false,
quiet: false,
seedFromPrimary: true,
};

for (let i = 0; i < argv.length; i += 1) {
Expand Down Expand Up @@ -99,6 +103,10 @@ function parseArgs(argv) {
args.force = true;
continue;
}
if (arg === '--no-seed') {
args.seedFromPrimary = false;
continue;
}
if (arg === '--link-visual') {
args.linkVisual = true;
continue;
Expand Down Expand Up @@ -247,13 +255,21 @@ async function main() {

let downloaded = 0;
let skipped = 0;
let seeded = 0;

if (!args.quiet) {
console.log(`[corpus] Source: ${corpus.source}`);
console.log(`[corpus] Destination: ${destinationRoot}`);
console.log(`[corpus] Corpus size: ${selectedDocs.length} documents`);
}

// Fast path for git worktrees: hardlink from the primary repo's corpus
// before reaching for R2. Downloads below still run for anything the
// primary is missing, so --force or a fresh fixture still trigger R2.
if (args.seedFromPrimary && !args.force && !args.dryRun) {
seeded = seedCorpusFromPrimary(destinationRoot, selectedDocs, { quiet: args.quiet });
Comment on lines +269 to +270
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Seed corpus only after filtering out missing bucket keys

seedCorpusFromPrimary runs on selectedDocs before the registry entries are filtered against client.listObjects(''). If registry.json contains a stale key that no longer exists in R2 but the primary checkout still has that file, this run seeds it locally, then marks it missing and prunes the registry without removing the seeded file. That leaves fresh worktrees with docs the same run has already determined are absent from the bucket; seeding should use existingDocs only.

Useful? React with 👍 / 👎.

}

if (corpus.source === REGISTRY_KEY && corpus.registry) {
const allObjectKeys = await client.listObjects('');
const objectKeySet = new Set(allObjectKeys.map((key) => normalizePath(key).toLowerCase()));
Expand Down Expand Up @@ -316,6 +332,14 @@ async function main() {
const { relativePath, objectKey, destinationPath } = toDownload[idx];

try {
// Unlink before writing. If the destination is a hardlink to the
// primary repo's corpus (seeded above), wrangler's r2 object get
// would otherwise write through and mutate the primary's file.
try {
fs.rmSync(destinationPath, { force: true });
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Preserve existing file until replacement download succeeds

worker() now unconditionally deletes destinationPath before getObjectToFile. In --force runs (or any case with an existing file), a transient R2/wrangler error will remove the previously valid local DOCX and then abort, leaving the corpus incomplete. This regresses pull resiliency versus replacing the file only after a successful download; using a temp file + rename (or unlinking only known hardlinked seeds) avoids this data-loss path.

Useful? React with 👍 / 👎.

} catch {
// swallow — the write below will surface any real permission issue
}
await client.getObjectToFile(objectKey, destinationPath);
downloaded += 1;
} catch (error) {
Expand Down Expand Up @@ -379,12 +403,16 @@ async function main() {

const elapsed = Date.now() - startedAt;
if (args.quiet) {
if (downloaded > 0) {
console.log(`[corpus] Synced ${downloaded} new document(s) in ${formatDurationMs(elapsed)}`);
if (downloaded > 0 || seeded > 0) {
const parts = [];
if (seeded > 0) parts.push(`${seeded} seeded`);
if (downloaded > 0) parts.push(`${downloaded} downloaded`);
console.log(`[corpus] Synced ${parts.join(' + ')} in ${formatDurationMs(elapsed)}`);
}
} else {
const seedPart = seeded > 0 ? `, Seeded: ${seeded}` : '';
console.log(
`[corpus] Done. Downloaded: ${downloaded}, Skipped: ${skipped}, Missing: ${missingRegistryPaths.length}, Elapsed: ${formatDurationMs(elapsed)}`,
`[corpus] Done. Downloaded: ${downloaded}${seedPart}, Skipped: ${skipped}, Missing: ${missingRegistryPaths.length}, Elapsed: ${formatDurationMs(elapsed)}`,
);
}
} finally {
Expand Down
79 changes: 78 additions & 1 deletion scripts/corpus/shared.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import os from 'node:os';
import path from 'node:path';
import crypto from 'node:crypto';
import process from 'node:process';
import { execFile as execFileCb } from 'node:child_process';
import { execFile as execFileCb, execFileSync } from 'node:child_process';
import { fileURLToPath, pathToFileURL } from 'node:url';
import { promisify } from 'node:util';
import { createRequire } from 'node:module';
Expand Down Expand Up @@ -468,6 +468,83 @@ export function ensureVisualTestDataSymlink(corpusRoot) {
return { linked: true, changed: true, backupPath: null };
}

/**
 * Locate the primary checkout when this repo is a linked git worktree.
 *
 * Asks git for the shared ("common") git directory via
 * `git rev-parse --git-common-dir`, resolves the answer against REPO_ROOT, and
 * treats the parent of that directory as the primary repo root.
 *
 * @returns {string | null} Absolute path of the primary repo root, or null when
 *   the common dir is this checkout's own `.git`, when the derived root is
 *   REPO_ROOT itself, or when the git invocation fails for any reason.
 */
export function findPrimaryRepoRoot() {
  try {
    const reportedCommonDir = execFileSync('git', ['rev-parse', '--git-common-dir'], {
      cwd: REPO_ROOT,
      // Only stdout is needed; stderr is discarded so git errors stay silent.
      stdio: ['ignore', 'pipe', 'ignore'],
      encoding: 'utf8',
    }).trim();

    // git may print a relative path (e.g. ".git"), so anchor it at REPO_ROOT.
    const sharedGitDir = path.resolve(REPO_ROOT, reportedCommonDir);
    const candidateRoot = path.dirname(sharedGitDir);

    // A regular checkout reports its own .git as the common dir.
    const pointsAtOwnGitDir = sharedGitDir === path.resolve(REPO_ROOT, '.git');
    // Guard against a common dir that still lives directly under this checkout.
    const pointsAtOwnRoot = candidateRoot === REPO_ROOT;

    return pointsAtOwnGitDir || pointsAtOwnRoot ? null : candidateRoot;
  } catch {
    // Not a git repo, git not installed, or any other failure: no primary.
    return null;
  }
}

/**
 * Seed the worktree's corpus from the primary repo using hardlinks, falling
 * back to a plain copy when linking fails (e.g. across filesystems).
 *
 * Files that already exist at the destination, and files the primary does not
 * have, are skipped. Seeding is a best-effort fast path: a file that can be
 * neither linked nor copied is skipped (and any partial output removed) rather
 * than aborting the caller, so it is simply not counted as seeded.
 *
 * Does nothing unless findPrimaryRepoRoot() reports a primary repo AND that
 * repo has a corpus directory at the default location.
 *
 * @param {string} destinationRoot - Corpus directory to seed into.
 * @param {Array<{ relative_path: string }>} selectedDocs - Docs to seed; each
 *   entry's `relative_path` is normalized and joined onto both corpus roots.
 * @param {{ quiet?: boolean }} [options] - `quiet` suppresses the summary logs.
 * @returns {number} Count of files successfully hardlinked or copied.
 */
export function seedCorpusFromPrimary(destinationRoot, selectedDocs, { quiet = false } = {}) {
  const primaryRoot = findPrimaryRepoRoot();
  if (!primaryRoot) return 0;

  // Follow the primary's default corpus layout. This is intentionally a fixed
  // path — users with custom --dest in the primary can still fall back to R2.
  const primaryCorpus = path.join(primaryRoot, path.basename(DEFAULT_CORPUS_ROOT));
  if (!fs.existsSync(primaryCorpus)) return 0;

  let seeded = 0;
  let copyFallback = 0;
  let failed = 0;

  for (const doc of selectedDocs) {
    const relativePath = normalizePath(doc.relative_path);
    if (!relativePath) continue;

    const primaryPath = path.join(primaryCorpus, relativePath);
    const destinationPath = path.join(destinationRoot, relativePath);

    if (fs.existsSync(destinationPath)) continue;
    if (!fs.existsSync(primaryPath)) continue;

    try {
      fs.mkdirSync(path.dirname(destinationPath), { recursive: true });
      try {
        fs.linkSync(primaryPath, destinationPath);
      } catch {
        // Hardlink fails across filesystems or for special files — copy instead
        fs.copyFileSync(primaryPath, destinationPath);
        copyFallback += 1;
      }
      seeded += 1;
    } catch {
      // Neither link nor copy worked. Drop any partially written destination so
      // a truncated file is not later mistaken for an already-present document.
      try {
        fs.rmSync(destinationPath, { force: true });
      } catch {
        // Nothing more to do here; the file is simply not counted as seeded.
      }
      failed += 1;
    }
  }

  if (seeded > 0 && !quiet) {
    const relPrimary = path.relative(REPO_ROOT, primaryRoot) || primaryRoot;
    const method = copyFallback === seeded ? 'copied' : copyFallback > 0 ? 'hardlinked (with copy fallback)' : 'hardlinked';
    console.log(`[corpus] Seeded ${seeded} file(s) ${method} from primary repo: ${relPrimary}`);
  }
  if (failed > 0 && !quiet) {
    console.log(`[corpus] Could not seed ${failed} file(s) from primary repo; leaving them for the regular download step`);
  }
  return seeded;
}

export function applyPathFilters(paths, { filters = [], matches = [], excludes = [] } = {}) {
const normalizedFilters = filters.map((value) => String(value).toLowerCase()).filter(Boolean);
const normalizedMatches = matches.map((value) => String(value).toLowerCase()).filter(Boolean);
Expand Down
Loading