-
Notifications
You must be signed in to change notification settings - Fork 131
feat(corpus): seed worktree pulls from the primary repo #2867
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,6 +17,7 @@ import { | |
| normalizePath, | ||
| printCorpusEnvHint, | ||
| saveRegistry, | ||
| seedCorpusFromPrimary, | ||
| sortRegistryDocs, | ||
| writeProgressBar, | ||
| } from './shared.mjs'; | ||
|
|
@@ -48,6 +49,8 @@ Options: | |
| --match <text> Substring filter (repeatable) | ||
| --exclude <prefix> Exclude filter (repeatable) | ||
| --force Re-download files even if they already exist | ||
| --no-seed In a git worktree, skip hardlinking from the primary | ||
| repo's corpus before pulling from R2 | ||
| --link-visual Point tests/visual/test-data at --dest via symlink | ||
| --dry-run Print actions without downloading | ||
| --quiet Suppress verbose logs; show only progress and summary | ||
|
|
@@ -65,6 +68,7 @@ function parseArgs(argv) { | |
| linkVisual: false, | ||
| dryRun: false, | ||
| quiet: false, | ||
| seedFromPrimary: true, | ||
| }; | ||
|
|
||
| for (let i = 0; i < argv.length; i += 1) { | ||
|
|
@@ -99,6 +103,10 @@ function parseArgs(argv) { | |
| args.force = true; | ||
| continue; | ||
| } | ||
| if (arg === '--no-seed') { | ||
| args.seedFromPrimary = false; | ||
| continue; | ||
| } | ||
| if (arg === '--link-visual') { | ||
| args.linkVisual = true; | ||
| continue; | ||
|
|
@@ -247,13 +255,21 @@ async function main() { | |
|
|
||
| let downloaded = 0; | ||
| let skipped = 0; | ||
| let seeded = 0; | ||
|
|
||
| if (!args.quiet) { | ||
| console.log(`[corpus] Source: ${corpus.source}`); | ||
| console.log(`[corpus] Destination: ${destinationRoot}`); | ||
| console.log(`[corpus] Corpus size: ${selectedDocs.length} documents`); | ||
| } | ||
|
|
||
| // Fast path for git worktrees: hardlink from the primary repo's corpus | ||
| // before reaching for R2. Downloads below still run for anything the | ||
| // primary is missing, so --force or a fresh fixture still trigger R2. | ||
| if (args.seedFromPrimary && !args.force && !args.dryRun) { | ||
| seeded = seedCorpusFromPrimary(destinationRoot, selectedDocs, { quiet: args.quiet }); | ||
| } | ||
|
|
||
| if (corpus.source === REGISTRY_KEY && corpus.registry) { | ||
| const allObjectKeys = await client.listObjects(''); | ||
| const objectKeySet = new Set(allObjectKeys.map((key) => normalizePath(key).toLowerCase())); | ||
|
|
@@ -316,6 +332,14 @@ async function main() { | |
| const { relativePath, objectKey, destinationPath } = toDownload[idx]; | ||
|
|
||
| try { | ||
| // Unlink before writing. If the destination is a hardlink to the | ||
| // primary repo's corpus (seeded above), wrangler's r2 object get | ||
| // would otherwise write through and mutate the primary's file. | ||
| try { | ||
| fs.rmSync(destinationPath, { force: true }); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Useful? React with 👍 / 👎. |
||
| } catch { | ||
| // swallow — the write below will surface any real permission issue | ||
| } | ||
| await client.getObjectToFile(objectKey, destinationPath); | ||
| downloaded += 1; | ||
| } catch (error) { | ||
|
|
@@ -379,12 +403,16 @@ async function main() { | |
|
|
||
| const elapsed = Date.now() - startedAt; | ||
| if (args.quiet) { | ||
| if (downloaded > 0) { | ||
| console.log(`[corpus] Synced ${downloaded} new document(s) in ${formatDurationMs(elapsed)}`); | ||
| if (downloaded > 0 || seeded > 0) { | ||
| const parts = []; | ||
| if (seeded > 0) parts.push(`${seeded} seeded`); | ||
| if (downloaded > 0) parts.push(`${downloaded} downloaded`); | ||
| console.log(`[corpus] Synced ${parts.join(' + ')} in ${formatDurationMs(elapsed)}`); | ||
| } | ||
| } else { | ||
| const seedPart = seeded > 0 ? `, Seeded: ${seeded}` : ''; | ||
| console.log( | ||
| `[corpus] Done. Downloaded: ${downloaded}, Skipped: ${skipped}, Missing: ${missingRegistryPaths.length}, Elapsed: ${formatDurationMs(elapsed)}`, | ||
| `[corpus] Done. Downloaded: ${downloaded}${seedPart}, Skipped: ${skipped}, Missing: ${missingRegistryPaths.length}, Elapsed: ${formatDurationMs(elapsed)}`, | ||
| ); | ||
| } | ||
| } finally { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`seedCorpusFromPrimary` runs on `selectedDocs` before the registry entries are filtered against `client.listObjects('')`. If `registry.json` contains a stale key that no longer exists in R2 but the primary checkout still has that file, this run seeds it locally, then marks it missing and prunes the registry without removing the seeded file. That leaves fresh worktrees with docs the same run has already determined are absent from the bucket; seeding should use `existingDocs` only. Useful? React with 👍 / 👎.