diff --git a/generated/architecture.md b/generated/architecture.md index a40c5def..1c3f4db0 100644 --- a/generated/architecture.md +++ b/generated/architecture.md @@ -228,7 +228,7 @@ Consumers receive an engine object and call methods on it. They never branch on **Current state:** The entire build pipeline is synchronous batch processing. Parse all files → insert all nodes → build all edges. The watcher does per-file updates but reimplements the pipeline in a simpler form. -**Problem:** For large repos (10K+ files), the user waits for the entire pipeline to complete before seeing anything. There's no progress reporting during parsing. There's no way to cancel a build mid-flight. The watcher's simplified pipeline diverges from the main build path (different code, different edge cases). +**Problem:** For large repos (10K+ files), the user waits for the entire pipeline to complete before seeing anything. There's no progress reporting during parsing. There's no way to cancel a build mid-flight. The watcher's simplified pipeline diverges from the main build path (different code, different edge cases). *(Note: two concrete edge cases — concurrent file edits causing EBUSY/EACCES/EPERM during read, and symlink loops causing infinite recursion in `collectFiles` — have been fixed. `readFileSafe` retries on transient OS errors and is shared between `builder.js` and `watcher.js`. `collectFiles` tracks visited real paths to break symlink cycles.)* **Ideal architecture — event-driven pipeline:** @@ -473,6 +473,8 @@ This is a simple LRU or TTL cache that sits between the analysis layer and the r **Problem:** Bug fixes to edge building in `builder.js` must be separately applied to `watcher.js`. The watcher's edge building is simpler (no barrel resolution, simpler confidence) which means watch-mode graphs are subtly different from full-build graphs. 
+**Partial progress:** `readFileSafe` (exported from `builder.js`, imported by `watcher.js`) is the first shared utility between the two modules. It replaces the bare `readFileSync` calls in both code paths and retries on transient OS errors (EBUSY/EACCES/EPERM) that occur when editors perform non-atomic saves. This is a small step toward the shared-stages goal. + **Ideal fix:** The pipeline architecture from point #4 eliminates this entirely. Watch mode uses the same pipeline stages, just triggered per-file instead of per-project. The `insertNodes` and `buildEdges` stages are literally the same functions. --- @@ -583,7 +585,7 @@ Consumers can only import from the documented entry points. Internal modules are | 9 | Transitive import-aware confidence | Low-Medium | Accuracy | | 14 | Query result caching | Low | Performance | | 8 | Config profiles for monorepos | Low | Feature | -| 15 | Unify watcher/builder code paths | Low | Falls out of #4 | +| 15 | Unify watcher/builder code paths | Low | Falls out of #4 (partial: `readFileSafe` shared) | Items 1–4 and 6 are foundational — they restructure the core and everything else becomes easier after them. Items 13 and 7 are the most impactful feature-level changes. Items 14–15 are natural consequences of earlier changes. 
diff --git a/package-lock.json b/package-lock.json index 7948d44b..f643c975 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1610,13 +1610,43 @@ ] }, "node_modules/@optave/codegraph-darwin-x64": { - "optional": true + "version": "2.2.1", + "resolved": "https://registry.npmjs.org/@optave/codegraph-darwin-x64/-/codegraph-darwin-x64-2.2.1.tgz", + "integrity": "sha512-pzKS4R3v+cOB86X+U2rGsgb4AAvAyBIK+WISimjG5i8JRb/XIFmfLYIUx1kBmRiWxtwU3rSUI0hkW6usJNISFA==", + "cpu": [ + "x64" + ], + "license": "Apache-2.0", + "optional": true, + "os": [ + "darwin" + ] }, "node_modules/@optave/codegraph-linux-x64-gnu": { - "optional": true + "version": "2.2.1", + "resolved": "https://registry.npmjs.org/@optave/codegraph-linux-x64-gnu/-/codegraph-linux-x64-gnu-2.2.1.tgz", + "integrity": "sha512-EBuVlqxZpmGVSqNHyZcYksN52K4Gz76zp4H86YqQFiLkASS+SfjT4zyWz51r/pns9EflP04MYm+vd+AHLwAqQg==", + "cpu": [ + "x64" + ], + "license": "Apache-2.0", + "optional": true, + "os": [ + "linux" + ] }, "node_modules/@optave/codegraph-win32-x64-msvc": { - "optional": true + "version": "2.2.1", + "resolved": "https://registry.npmjs.org/@optave/codegraph-win32-x64-msvc/-/codegraph-win32-x64-msvc-2.2.1.tgz", + "integrity": "sha512-4mc38KXAnrT1CUg5HuXcvJFXo8FT3BwWlAi1kTag8D6ZGBCVr0ijHHkTn6FMMkaPeuaNIQS/6LqkH5ew1UHiDw==", + "cpu": [ + "x64" + ], + "license": "Apache-2.0", + "optional": true, + "os": [ + "win32" + ] }, "node_modules/@protobufjs/aspromise": { "version": "1.1.2", diff --git a/src/builder.js b/src/builder.js index 11f02e1e..01d10225 100644 --- a/src/builder.js +++ b/src/builder.js @@ -43,8 +43,28 @@ const BUILTIN_RECEIVERS = new Set([ 'require', ]); -export function collectFiles(dir, files = [], config = {}, directories = null) { +export function collectFiles( + dir, + files = [], + config = {}, + directories = null, + _visited = new Set(), +) { const trackDirs = directories !== null; + + // Resolve real path to detect symlink loops + let realDir; + try { + realDir = fs.realpathSync(dir); + } 
catch { + return trackDirs ? { files, directories } : files; + } + if (_visited.has(realDir)) { + warn(`Symlink loop detected, skipping: ${dir}`); + return trackDirs ? { files, directories } : files; + } + _visited.add(realDir); + let entries; try { entries = fs.readdirSync(dir, { withFileTypes: true }); @@ -67,7 +87,7 @@ export function collectFiles(dir, files = [], config = {}, directories = null) { const full = path.join(dir, entry.name); if (entry.isDirectory()) { - collectFiles(full, files, config, directories); + collectFiles(full, files, config, directories, _visited); } else if (EXTENSIONS.has(path.extname(entry.name))) { files.push(full); hasFiles = true; @@ -125,6 +145,28 @@ function fileStat(filePath) { } } +/** + * Read a file with retry on transient errors (EBUSY/EACCES/EPERM). + * Editors performing non-atomic saves can cause these during mid-write. + */ +const TRANSIENT_CODES = new Set(['EBUSY', 'EACCES', 'EPERM']); +const RETRY_DELAY_MS = 50; + +export function readFileSafe(filePath, retries = 2) { + for (let attempt = 0; ; attempt++) { + try { + return fs.readFileSync(filePath, 'utf-8'); + } catch (err) { + if (attempt < retries && TRANSIENT_CODES.has(err.code)) { + const end = Date.now() + RETRY_DELAY_MS; + while (Date.now() < end) {} + continue; + } + throw err; + } + } +} + /** * Determine which files have changed since last build. 
* Three-tier cascade: @@ -193,7 +235,7 @@ function getChangedFiles(db, allFiles, rootDir) { let content; try { - content = fs.readFileSync(absPath, 'utf-8'); + content = readFileSafe(absPath); } catch { continue; } @@ -256,7 +298,7 @@ function getChangedFiles(db, allFiles, rootDir) { for (const item of needsHash) { let content; try { - content = fs.readFileSync(item.file, 'utf-8'); + content = readFileSafe(item.file); } catch { continue; } @@ -459,7 +501,7 @@ export async function buildGraph(rootDir, opts = {}) { const absPath = path.join(rootDir, relPath); let code; try { - code = fs.readFileSync(absPath, 'utf-8'); + code = readFileSafe(absPath); } catch { code = null; } diff --git a/src/watcher.js b/src/watcher.js index 1ce3d11a..0afe05e0 100644 --- a/src/watcher.js +++ b/src/watcher.js @@ -1,5 +1,6 @@ import fs from 'node:fs'; import path from 'node:path'; +import { readFileSafe } from './builder.js'; import { EXTENSIONS, IGNORE_DIRS, normalizePath } from './constants.js'; import { initSchema, openDb } from './db.js'; import { appendJournalEntries } from './journal.js'; @@ -35,7 +36,7 @@ async function updateFile(_db, rootDir, filePath, stmts, engineOpts, cache) { let code; try { - code = fs.readFileSync(filePath, 'utf-8'); + code = readFileSafe(filePath); } catch (err) { warn(`Cannot read ${relPath}: ${err.message}`); return null;