diff --git a/.clinerules b/.clinerules index c5c6fbd..054559d 100644 --- a/.clinerules +++ b/.clinerules @@ -46,6 +46,13 @@ CodeCritique is an AI-powered code review tool that: - Avoid raw `console.log(...)` in normal code paths - Document logging-related options such as `verbose` in JSDoc and pass them through to downstream helpers +### Incremental Embeddings + +- Treat `content_hash` as the source of truth for embedding freshness. Do not rely on `mtime` for correctness, especially in CI. +- Full `embeddings:generate` scans may prune stale file and document embeddings. Partial `--files` runs must not prune unrelated embeddings. +- Project-structure embeddings and retrieval are scoped by full `project_path`, not directory basename. +- Retrieval should only use project-scoped rows whose backing files still exist. + ### Key Commands ```bash diff --git a/.cursorrules b/.cursorrules index 002d488..34a3309 100644 --- a/.cursorrules +++ b/.cursorrules @@ -53,6 +53,13 @@ CodeCritique is an AI-powered code review tool that: - Avoid raw `console.log(...)` in normal code paths - Document logging-related options such as `verbose` in JSDoc and pass them through to downstream helpers +### Incremental Embeddings + +- Treat `content_hash` as the source of truth for embedding freshness. Do not rely on `mtime` for correctness, especially in CI. +- Full `embeddings:generate` scans may prune stale file and document embeddings. Partial `--files` runs must not prune unrelated embeddings. +- Project-structure embeddings and retrieval are scoped by full `project_path`, not directory basename. +- Retrieval should only use project-scoped rows whose backing files still exist. + ### Key Commands ```bash diff --git a/.roo/rules/01-project-rules.md b/.roo/rules/01-project-rules.md index 86adfea..e180550 100644 --- a/.roo/rules/01-project-rules.md +++ b/.roo/rules/01-project-rules.md @@ -43,6 +43,13 @@ For comprehensive guidelines, see **AGENTS.md** in the project root. - Avoid raw `console.log(...)` in normal code paths - Document logging-related options such as `verbose` in JSDoc and pass them through to downstream helpers +### Incremental Embeddings + +- Treat `content_hash` as the source of truth for embedding freshness. Do not rely on `mtime` for correctness, especially in CI. +- Full `embeddings:generate` scans may prune stale file and document embeddings. Partial `--files` runs must not prune unrelated embeddings. +- Project-structure embeddings and retrieval are scoped by full `project_path`, not directory basename. +- Retrieval should only use project-scoped rows whose backing files still exist. + ### Key Commands ```bash diff --git a/AGENTS.md b/AGENTS.md index 5136abb..b7f14c2 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -68,6 +68,16 @@ This document provides comprehensive guidance for AI coding agents working on th 2. **Analyze PR History** (optional): Fetch PR comments from GitHub → extract patterns → store for context 3. **Code Review**: Retrieve similar code examples + documentation → provide context to LLM → generate review +### Incremental Embeddings Rules + +- Treat `content_hash` as the source of truth for embedding freshness. Do not rely on `mtime` to decide whether embeddings are current, especially in CI. +- `embeddings:generate` has two modes: + - Full directory scans update changed files and prune stale embeddings for deleted files and documents. + - Partial `--files` runs update only the requested files and must not prune unrelated embeddings. +- Project-structure embeddings are scoped by full `project_path`, not by directory basename. +- Retrieval should only return project-scoped context and should validate that referenced files still exist before using stored content. +- Project summaries should be refreshed when the embedding inventory changes, not only when previously tracked key files change. + --- ## Directory Structure @@ -114,6 +124,7 @@ src/ │ ├── logging.js # Logging utilities │ ├── markdown.js # Markdown processing │ ├── mobilebert-tokenizer.js # Tokenization utilities +│ ├── path-utils.js # Project path boundary helpers │ ├── pr-chunking.js # PR chunking for large diffs │ └── string-utils.js # String manipulation utilities │ diff --git a/CLAUDE.md b/CLAUDE.md index f3017a7..79d4939 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -54,6 +54,13 @@ CodeCritique is an AI-powered code review tool that: - Avoid raw `console.log(...)` in normal code paths - Document logging-related options such as `verbose` in JSDoc and pass them through to downstream helpers +### Incremental Embeddings + +- Treat `content_hash` as the source of truth for embedding freshness. Do not rely on `mtime` for correctness, especially in CI. +- Full `embeddings:generate` scans may prune stale file and document embeddings. Partial `--files` runs must not prune unrelated embeddings. +- Project-structure embeddings and retrieval are scoped by full `project_path`, not directory basename. +- Retrieval should only use project-scoped rows whose backing files still exist. + ### Key Commands ```bash diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8168032..f1f6148 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -103,6 +103,7 @@ src/ ├── utils/ # Utility functions │ ├── git.js # Git operations │ ├── file-validation.js # File validation +│ ├── path-utils.js # Project path boundary helpers │ └── ... └── test-utils/ # Test utilities and fixtures .github/ diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 3d5b0df..5be73b2 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -33,6 +33,18 @@ Additionally, [Hugging Face Transformers.js](https://github.com/huggingface/tran [LanceDB](https://lancedb.com/) stores embeddings for fast similarity search. The vector database maintains an index of all embedded code examples, documentation, and review patterns, enabling efficient retrieval of relevant context. LanceDB's columnar storage format provides excellent performance for similarity searches across large codebases. It supports hybrid search combining vector similarity and full-text search with Reciprocal Rank Fusion (RRF) for optimal results. +### Incremental Embedding Generation + +Embedding generation is incremental and project-scoped. The system stores file embeddings, documentation chunk embeddings, and project-structure embeddings with a `project_path` so multiple repositories can share the same local LanceDB instance without mixing context. + +Freshness is determined by `content_hash`, not by file modification time. This is important in CI environments where checkout operations can rewrite `mtime` values even when file contents are unchanged. During generation: + +- Full `embeddings:generate` directory scans update new or changed files and prune stale embeddings for deleted files and removed documentation. +- Partial `embeddings:generate --files ...` runs update only the requested files and do not prune unrelated embeddings. +- Project-structure embeddings are regenerated only when the structure content changes and are keyed by full project identity rather than directory basename. + +This design keeps review context current without forcing full regeneration on every run. + ### Context Retrieval Finds relevant code examples, documentation, and historical patterns. When reviewing code, the system queries the vector database to retrieve: @@ -42,6 +54,8 @@ Finds relevant code examples, documentation, and historical patterns. When revie - Historical review patterns from past pull requests that show how similar issues were addressed - Custom guidelines and rules specific to your project +Retrieval is also defensive about freshness and project isolation. Results are filtered by `project_path` when available, validated against project path boundaries, and checked for file existence before stored content is used in review prompts. This prevents stale embeddings from deleted files and cross-project collisions from leaking into code review context. + ### LLM Integration [Anthropic Claude](https://www.anthropic.com/) (claude-sonnet-4-5 by default) analyzes code with rich contextual information. The retrieved context is combined with the code being reviewed and sent to Claude, which performs a comprehensive analysis considering: diff --git a/src/content-retrieval.js b/src/content-retrieval.js index c849b58..ccc5cb6 100644 --- a/src/content-retrieval.js +++ b/src/content-retrieval.js @@ -24,6 +24,8 @@ import { inferContextFromDocumentContent } from './utils/context-inference.js'; import { isGenericDocument, getGenericDocumentContext } from './utils/document-detection.js'; import { isDocumentationFile } from './utils/file-validation.js'; import { debug, verboseLog } from './utils/logging.js'; +import { isPathWithinProject } from './utils/path-utils.js'; +import { escapeSqlString } from './utils/string-utils.js'; const FILE_EMBEDDINGS_TABLE = TABLE_NAMES.FILE_EMBEDDINGS; const DOCUMENT_CHUNK_TABLE = TABLE_NAMES.DOCUMENT_CHUNK; @@ -54,6 +56,58 @@ export class ContentRetriever { this.cleaningUp = false; } + resolveProjectResultPath(filePath, resolvedProjectPath) { + if (!filePath) { + return null; + } + + const absolutePath = path.isAbsolute(filePath) ? path.resolve(filePath) : path.resolve(resolvedProjectPath, filePath); + return isPathWithinProject(absolutePath, resolvedProjectPath) ? absolutePath : null; + } + + async filterResultsForProject(results, resolvedProjectPath, getPath) { + const resultsToCheck = []; + const projectMatchMap = new Map(); + + for (let i = 0; i < results.length; i++) { + const result = results[i]; + const resultPath = getPath(result); + + if (result.project_path && result.project_path !== resolvedProjectPath) { + projectMatchMap.set(i, false); + continue; + } + + const absolutePath = this.resolveProjectResultPath(resultPath, resolvedProjectPath); + if (!absolutePath) { + projectMatchMap.set(i, false); + continue; + } + + resultsToCheck.push({ index: i, absolutePath, resultPath }); + } + + if (resultsToCheck.length > 0) { + const existenceResults = await Promise.all( + resultsToCheck.map(async ({ index, absolutePath, resultPath }) => { + try { + await fs.promises.access(absolutePath, fs.constants.F_OK); + return { index, exists: true }; + } catch { + debug(`Filtering out non-existent project file: ${resultPath}`); + return { index, exists: false }; + } + }) + ); + + for (const { index, exists } of existenceResults) { + projectMatchMap.set(index, exists); + } + } + + return results.filter((result, index) => projectMatchMap.get(index) === true); + } + /** * Find relevant documentation with sophisticated reranking * @param {string} queryText - The search query @@ -99,7 +153,7 @@ export class ContentRetriever { try { const tableSchema = await table.schema; if (tableSchema?.fields?.some((field) => field.name === 'project_path')) { - query = query.where(`project_path = '${resolvedProjectPath.replace(/'/g, "''")}'`); + query = query.where(`project_path = '${escapeSqlString(resolvedProjectPath)}'`); debug(`Filtering documentation by project_path: ${resolvedProjectPath}`); } } catch (schemaError) { @@ -109,65 +163,11 @@ export class ContentRetriever { const results = await query.limit(Math.max(limit * 3, 20)).toArray(); verboseLog(options, chalk.green(`Native hybrid search returned ${results.length} documentation results`)); - // OPTIMIZATION: Enhanced batch file existence checks with parallel processing - const docsToCheck = []; - const docProjectMatchMap = new Map(); - - // First pass: collect files that need existence checking - for (let i = 0; i < results.length; i++) { - const result = results[i]; - - if (result.project_path) { - docProjectMatchMap.set(i, result.project_path === resolvedProjectPath); - continue; - } - - if (!result.original_document_path) { - docProjectMatchMap.set(i, false); - continue; - } - - const filePath = result.original_document_path; - try { - if (path.isAbsolute(filePath)) { - docProjectMatchMap.set(i, filePath.startsWith(resolvedProjectPath)); - continue; - } - - const absolutePath = path.resolve(resolvedProjectPath, filePath); - if (absolutePath.startsWith(resolvedProjectPath)) { - // Mark for batch existence check - docsToCheck.push({ result, index: i, absolutePath, filePath }); - } else { - docProjectMatchMap.set(i, false); - } - } catch (error) { - debug(`Error filtering result for project: ${error.message}`); - docProjectMatchMap.set(i, false); - } - } - - // Enhanced batch check file existence with improved error handling - if (docsToCheck.length > 0) { - debug(`[OPTIMIZATION] Batch checking existence of ${docsToCheck.length} documentation files`); - const existencePromises = docsToCheck.map(async ({ index, absolutePath, filePath }) => { - try { - await fs.promises.access(absolutePath, fs.constants.F_OK); - return { index, exists: true }; - } catch { - debug(`Filtering out non-existent documentation file: ${filePath}`); - return { index, exists: false }; - } - }); - - const existenceResults = await Promise.all(existencePromises); - for (const { index, exists } of existenceResults) { - docProjectMatchMap.set(index, exists); - } - } - - // Filter results based on project match using the map - const projectFilteredResults = results.filter((result, index) => docProjectMatchMap.get(index) === true); + const projectFilteredResults = await this.filterResultsForProject( + results, + resolvedProjectPath, + (result) => result.original_document_path + ); verboseLog(options, chalk.blue(`Filtered to ${projectFilteredResults.length} documentation results from current project`)); let finalResults = projectFilteredResults.map((result) => { @@ -493,13 +493,13 @@ export class ContentRetriever { if (queryFilePath) { const normalizedQueryPath = path.resolve(resolvedProjectPath, queryFilePath); // Add condition to exclude the file being reviewed - const escapedPath = normalizedQueryPath.replace(/'/g, "''"); + const escapedPath = escapeSqlString(normalizedQueryPath); conditions.push(`path != '${escapedPath}'`); // Also check for relative path variants to be thorough const relativePath = path.relative(resolvedProjectPath, normalizedQueryPath); if (relativePath && !relativePath.startsWith('..')) { - const escapedRelativePath = relativePath.replace(/'/g, "''"); + const escapedRelativePath = escapeSqlString(relativePath); conditions.push(`path != '${escapedRelativePath}'`); } @@ -515,7 +515,7 @@ export class ContentRetriever { if (hasProjectPathField) { // Use exact match for project path - conditions.push(`project_path = '${resolvedProjectPath.replace(/'/g, "''")}'`); + conditions.push(`project_path = '${escapeSqlString(resolvedProjectPath)}'`); debug(`Filtering by project_path: ${resolvedProjectPath}`); } } @@ -532,72 +532,11 @@ export class ContentRetriever { verboseLog(options, chalk.green(`Native hybrid search returned ${results.length} results`)); - // OPTIMIZATION: Batch file existence checks for better performance - const resultsToCheck = []; - const projectMatchMap = new Map(); - - // First pass: collect files that need existence checking - for (let i = 0; i < results.length; i++) { - const result = results[i]; - - // Use project_path field if available (new schema) - if (result.project_path) { - projectMatchMap.set(i, result.project_path === resolvedProjectPath); - continue; - } - - // Fallback for old embeddings without project_path field - if (!result.path && !result.original_document_path) { - projectMatchMap.set(i, false); - continue; - } - - const filePath = result.original_document_path || result.path; - try { - // Check if this result belongs to the current project - // First try as absolute path - if (path.isAbsolute(filePath)) { - projectMatchMap.set(i, filePath.startsWith(resolvedProjectPath)); - continue; - } - - // For relative paths, check if the file actually exists in the project - const absolutePath = path.resolve(resolvedProjectPath, filePath); - - // Verify the path is within project bounds - if (absolutePath.startsWith(resolvedProjectPath)) { - // Mark for batch existence check - resultsToCheck.push({ result, index: i, absolutePath }); - } else { - projectMatchMap.set(i, false); - } - } catch (error) { - debug(`Error filtering result for project: ${error.message}`); - projectMatchMap.set(i, false); - } - } - - // Batch check file existence for better performance - if (resultsToCheck.length > 0) { - debug(`[OPTIMIZATION] Batch checking existence of ${resultsToCheck.length} files`); - const existencePromises = resultsToCheck.map(async ({ result, index, absolutePath }) => { - try { - await fs.promises.access(absolutePath, fs.constants.F_OK); - return { index, exists: true }; - } catch { - debug(`Filtering out non-existent file: ${result.original_document_path || result.path}`); - return { index, exists: false }; - } - }); - - const existenceResults = await Promise.all(existencePromises); - for (const { index, exists } of existenceResults) { - projectMatchMap.set(index, exists); - } - } - - // Filter results based on project match using the map - const projectFilteredResults = results.filter((result, index) => projectMatchMap.get(index) === true); + const projectFilteredResults = await this.filterResultsForProject( + results, + resolvedProjectPath, + (result) => result.original_document_path || result.path + ); verboseLog(options, chalk.blue(`Filtered to ${projectFilteredResults.length} results from current project`)); @@ -643,14 +582,11 @@ export class ContentRetriever { try { const fileTable = await this.database.getTable(FILE_EMBEDDINGS_TABLE); if (fileTable) { - // Look for project-specific structure ID - const projectStructureId = `__project_structure__${path.basename(resolvedProjectPath)}`; - let structureResults = await fileTable.query().where(`id = '${projectStructureId}'`).limit(1).toArray(); - - // Fall back to generic project structure if project-specific one doesn't exist - if (structureResults.length === 0) { - structureResults = await fileTable.query().where("id = '__project_structure__'").limit(1).toArray(); - } + const structureResults = await fileTable + .query() + .where(`project_path = '${escapeSqlString(resolvedProjectPath)}' AND type = 'directory-structure'`) + .limit(1) + .toArray(); if (structureResults.length > 0) { const structureRecord = structureResults[0]; diff --git a/src/content-retrieval.test.js b/src/content-retrieval.test.js index e03c4d0..96636f3 100644 --- a/src/content-retrieval.test.js +++ b/src/content-retrieval.test.js @@ -459,6 +459,14 @@ describe('ContentRetriever', () => { const results = await retriever.findRelevantDocs('query', { projectPath: '/project' }); expect(results.length).toBe(0); }); + + it('should reject sibling project absolute paths for documentation', async () => { + mockTable.toArray.mockResolvedValue([ + createMockDocResult({ project_path: null, original_document_path: '/project-old/docs/readme.md' }), + ]); + const results = await retriever.findRelevantDocs('query', { projectPath: '/project' }); + expect(results).toHaveLength(0); + }); }); // ========================================================================== @@ -498,6 +506,12 @@ describe('ContentRetriever', () => { expect(results.length).toBe(0); }); + it('should reject sibling project absolute paths for code results', async () => { + mockTable.toArray.mockResolvedValue([createMockCodeResult({ project_path: null, path: '/project-old/src/file.js' })]); + const results = await retriever.findSimilarCode('query', { projectPath: '/project', similarityThreshold: 0 }); + expect(results).toHaveLength(0); + }); + it('should handle schema check errors', async () => { mockTable.schema = null; mockTable.toArray.mockResolvedValue([createMockCodeResult()]); @@ -511,20 +525,31 @@ describe('ContentRetriever', () => { // ========================================================================== describe('project structure inclusion', () => { - it('should fall back to generic project structure', async () => { + it('should include only project-scoped structure rows', async () => { mockTable.toArray.mockResolvedValue([createMockCodeResult()]); - mockTable.query.mockReturnValue({ + const queryChain = { where: vi.fn().mockReturnThis(), limit: vi.fn().mockReturnThis(), - toArray: vi - .fn() - .mockResolvedValueOnce([]) - .mockResolvedValueOnce([ - { id: '__project_structure__', content: 'Generic structure', path: '.', vector: new Float32Array(384).fill(0.1) }, - ]), + toArray: vi.fn().mockResolvedValue([ + { + id: '__project_structure__#abc12345', + content: 'Project structure', + path: '.', + project_path: '/project', + type: 'directory-structure', + vector: new Float32Array(384).fill(0.1), + }, + ]), + }; + mockTable.query.mockReturnValue(queryChain); + const results = await retriever.findSimilarCode('query', { + includeProjectStructure: true, + similarityThreshold: 0, + projectPath: '/project', }); - const results = await retriever.findSimilarCode('query', { includeProjectStructure: true, similarityThreshold: 0 }); expect(results.some((r) => r.type === 'project-structure')).toBe(true); + expect(queryChain.where).toHaveBeenCalledWith(expect.stringContaining("type = 'directory-structure'")); + expect(queryChain.where).toHaveBeenCalledWith(expect.stringContaining("project_path = '/project'")); }); it('should handle project structure inclusion errors', async () => { @@ -539,6 +564,21 @@ describe('ContentRetriever', () => { expect(console.warn).toHaveBeenCalledWith(expect.stringContaining('Project structure inclusion failed')); }); + it('should skip project structure rows from another project', async () => { + mockTable.toArray.mockResolvedValue([createMockCodeResult()]); + mockTable.query.mockReturnValue({ + where: vi.fn().mockReturnThis(), + limit: vi.fn().mockReturnThis(), + toArray: vi.fn().mockResolvedValue([]), + }); + const results = await retriever.findSimilarCode('query', { + includeProjectStructure: true, + similarityThreshold: 0, + projectPath: '/project', + }); + expect(results.some((r) => r.type === 'project-structure')).toBe(false); + }); + it('should skip structure when similarity is too low', async () => { mockTable.toArray.mockResolvedValue([createMockCodeResult()]); mockTable.query.mockReturnValue({ diff --git a/src/embeddings/database.js b/src/embeddings/database.js index 3008843..e5c3e9e 100644 --- a/src/embeddings/database.js +++ b/src/embeddings/database.js @@ -24,6 +24,8 @@ import * as lancedb from '@lancedb/lancedb'; import { Field, FixedSizeList, Float32, Int32, Schema, Utf8 } from 'apache-arrow'; import chalk from 'chalk'; import { debug, verboseLog } from '../utils/logging.js'; +import { isPathWithinProject } from '../utils/path-utils.js'; +import { escapeSqlString } from '../utils/string-utils.js'; import { EMBEDDING_DIMENSIONS, TABLE_NAMES } from './constants.js'; import { LANCEDB_PATH } from './constants.js'; import { createDatabaseError, ERROR_CODES } from './errors.js'; @@ -467,7 +469,6 @@ export class DatabaseManager { let db = null; try { const resolvedProjectPath = path.resolve(projectPath); - const projectName = path.basename(resolvedProjectPath); // Safety check: ensure project path is valid and not root if (!resolvedProjectPath || resolvedProjectPath === '/' || resolvedProjectPath === path.resolve('/')) { @@ -480,7 +481,7 @@ export class DatabaseManager { throw new Error(`Project path too generic: ${resolvedProjectPath}. For safety, project must be at least 3 levels deep.`); } - verboseLog({}, chalk.cyan(`Clearing embeddings for project: ${resolvedProjectPath} (${projectName})`)); + verboseLog({}, chalk.cyan(`Clearing embeddings for project: ${resolvedProjectPath}`)); if (!fs.existsSync(this.dbPath)) { verboseLog({}, chalk.yellow('LanceDB directory does not exist, nothing to clear.')); @@ -495,27 +496,21 @@ export class DatabaseManager { if (tableNames.includes(this.fileEmbeddingsTable)) { const fileTable = await db.openTable(this.fileEmbeddingsTable); await this._validateTableHasProjectPath(fileTable, this.fileEmbeddingsTable); - deletedCount += await this._clearProjectTableRecords( - db, - this.fileEmbeddingsTable, - resolvedProjectPath, - projectName, - 'project_path' - ); + deletedCount += await this._clearProjectTableRecords(db, this.fileEmbeddingsTable, resolvedProjectPath, 'project_path'); } // Clear document chunk embeddings for this project if (tableNames.includes(this.documentChunkTable)) { const docTable = await db.openTable(this.documentChunkTable); await this._validateTableHasProjectPath(docTable, this.documentChunkTable); - deletedCount += await this._clearProjectTableRecords(db, this.documentChunkTable, resolvedProjectPath, projectName, 'project_path'); + deletedCount += await this._clearProjectTableRecords(db, this.documentChunkTable, resolvedProjectPath, 'project_path'); } // Clear project summaries for this project if (tableNames.includes(PROJECT_SUMMARIES_TABLE)) { const summariesTable = await db.openTable(PROJECT_SUMMARIES_TABLE); await this._validateTableHasProjectPath(summariesTable, PROJECT_SUMMARIES_TABLE); - deletedCount += await this._clearProjectTableRecords(db, PROJECT_SUMMARIES_TABLE, resolvedProjectPath, projectName, 'project_path'); + deletedCount += await this._clearProjectTableRecords(db, PROJECT_SUMMARIES_TABLE, resolvedProjectPath, 'project_path'); } // Note: PR comments are cleared via separate pr-history:clear command @@ -545,6 +540,82 @@ export class DatabaseManager { } } + /** + * Prune file embeddings whose paths are no longer live for a project. + * @param {string} projectPath - Project path + * @param {Iterable} livePaths - Paths that should remain + * @returns {Promise} Number of deleted records + */ + async pruneProjectFileEmbeddings(projectPath, livePaths = []) { + const resolvedProjectPath = path.resolve(projectPath); + const table = await this.getTable(this.fileEmbeddingsTable); + if (!table) { + return 0; + } + + const livePathSet = new Set(Array.from(livePaths)); + const records = await table + .query() + .where(`project_path = '${escapeSqlString(resolvedProjectPath)}'`) + .toArray(); + let deletedCount = 0; + + for (const record of records) { + if (record.type === 'directory-structure') { + continue; + } + + if (!record.path || livePathSet.has(record.path)) { + continue; + } + + try { + await table.delete(`id = '${escapeSqlString(record.id)}'`); + deletedCount++; + } catch (deleteError) { + console.warn(chalk.yellow(`Warning: Could not prune file embedding ${record.id}: ${deleteError.message}`)); + } + } + + return deletedCount; + } + + /** + * Prune document chunk embeddings whose source docs are no longer live for a project. + * @param {string} projectPath - Project path + * @param {Iterable} liveDocumentPaths - Document paths that should remain + * @returns {Promise} Number of deleted records + */ + async pruneProjectDocumentChunks(projectPath, liveDocumentPaths = []) { + const resolvedProjectPath = path.resolve(projectPath); + const table = await this.getTable(this.documentChunkTable); + if (!table) { + return 0; + } + + const liveDocumentPathSet = new Set(Array.from(liveDocumentPaths)); + const records = await table + .query() + .where(`project_path = '${escapeSqlString(resolvedProjectPath)}'`) + .toArray(); + let deletedCount = 0; + + for (const record of records) { + if (!record.original_document_path || liveDocumentPathSet.has(record.original_document_path)) { + continue; + } + + try { + await table.delete(`id = '${escapeSqlString(record.id)}'`); + deletedCount++; + } catch (deleteError) { + console.warn(chalk.yellow(`Warning: Could not prune document chunk ${record.id}: ${deleteError.message}`)); + } + } + + return deletedCount; + } + // ============================================================================ // PRIVATE METHODS // ============================================================================ @@ -673,32 +744,24 @@ export class DatabaseManager { * @param {LanceDBConnection} db - Database connection * @param {string} tableName - Table name * @param {string} resolvedProjectPath - Resolved project path - * @param {string} projectName - Project name * @param {string} pathField - Path field name * @returns {Promise} Number of deleted records * @private */ - async _clearProjectTableRecords(db, tableName, resolvedProjectPath, projectName, pathField) { + async _clearProjectTableRecords(db, tableName, resolvedProjectPath, pathField) { const table = await db.openTable(tableName); const allRecords = await table.query().toArray(); const projectRecords = allRecords.filter((record) => { if (!record[pathField]) return false; - // Check for project-specific structure - if (record.id === `__project_structure__${projectName}` || record.id === '__project_structure__') { - return true; - } - // Check if this record belongs to the current project try { if (pathField === 'project_path') { - // For project_path field, do direct equality check return record[pathField] === resolvedProjectPath; } else { - // For other path fields (like 'path'), resolve relative to project path const absolutePath = path.resolve(resolvedProjectPath, record[pathField]); - return absolutePath.startsWith(resolvedProjectPath); + return isPathWithinProject(absolutePath, resolvedProjectPath); } } catch { return false; @@ -711,7 +774,7 @@ export class DatabaseManager { let deletedCount = 0; for (const record of projectRecords) { try { - await table.delete(`id = '${record.id.replace(/'/g, "''")}'`); + await table.delete(`id = '${escapeSqlString(record.id)}'`); deletedCount++; } catch (deleteError) { console.warn(chalk.yellow(`Warning: Could not delete record ${record.id}: ${deleteError.message}`)); diff --git a/src/embeddings/database.test.js b/src/embeddings/database.test.js index 7513dc3..e989af3 100644 --- a/src/embeddings/database.test.js +++ b/src/embeddings/database.test.js @@ -353,6 +353,49 @@ describe('DatabaseManager', () => { mockDb.tableNames.mockRejectedValue(new Error('DB error')); await expect(dbManager.clearProjectEmbeddings('/test/project/deep')).rejects.toThrow('DB error'); }); + + it('should not delete same-basename structure rows from another project', async () => { + mockDb.tableNames.mockResolvedValue(['file_embeddings']); + setupQuery([ + { id: '__project_structure__#current', project_path: '/test/project/deep' }, + { id: '__project_structure__#other', project_path: '/other/project/deep' }, + ]); + await dbManager.clearProjectEmbeddings('/test/project/deep'); + expect(mockTable.delete).toHaveBeenCalledTimes(1); + expect(mockTable.delete).toHaveBeenCalledWith(expect.stringContaining('__project_structure__#current')); + }); + }); + + describe('prune project embeddings', () => { + it('should prune stale file embeddings only for missing live paths', async () => { + mockDb.tableNames.mockResolvedValue(['file_embeddings']); + mockTable.query.mockReturnValue({ + where: vi.fn().mockReturnThis(), + toArray: vi.fn().mockResolvedValue([ + { id: 'keep-file', path: 'src/keep.js', type: 'file', project_path: '/test/project/deep' }, + { id: 'drop-file', path: 'src/drop.js', type: 'file', project_path: '/test/project/deep' }, + { id: 'structure', path: '.', type: 'directory-structure', project_path: '/test/project/deep' }, + ]), + }); + const deleted = await dbManager.pruneProjectFileEmbeddings('/test/project/deep', new Set(['src/keep.js'])); + expect(deleted).toBe(1); + expect(mockTable.delete).toHaveBeenCalledWith(expect.stringContaining('drop-file')); + expect(mockTable.delete).not.toHaveBeenCalledWith(expect.stringContaining('structure')); + }); + + it('should prune stale document chunks only for missing live docs', async () => { + mockDb.tableNames.mockResolvedValue(['document_chunk_embeddings']); + mockTable.query.mockReturnValue({ + where: vi.fn().mockReturnThis(), + toArray: vi.fn().mockResolvedValue([ + { id: 'keep-doc', original_document_path: 'docs/keep.md', project_path: '/test/project/deep' }, + { id: 'drop-doc', original_document_path: 'docs/drop.md', project_path: '/test/project/deep' }, + ]), + }); + const deleted = await dbManager.pruneProjectDocumentChunks('/test/project/deep', new Set(['docs/keep.md'])); + expect(deleted).toBe(1); + expect(mockTable.delete).toHaveBeenCalledWith(expect.stringContaining('drop-doc')); + }); }); // ========================================================================== diff --git a/src/embeddings/file-processor.js b/src/embeddings/file-processor.js index 6e5c384..5ee8b56 100644 --- a/src/embeddings/file-processor.js +++ b/src/embeddings/file-processor.js @@ -21,10 +21,32 @@ import { isDocumentationFile, shouldProcessFile as utilsShouldProcessFile, batch import { detectLanguageFromExtension } from '../utils/language-detection.js'; import { debug, verboseLog } from '../utils/logging.js'; import { extractMarkdownChunks } from '../utils/markdown.js'; -import { slugify } from '../utils/string-utils.js'; +import { escapeSqlString, slugify } from '../utils/string-utils.js'; import { TABLE_NAMES, LANCEDB_DIR_NAME, FASTEMBED_CACHE_DIR_NAME } from './constants.js'; import { createFileProcessingError } from './errors.js'; +const FILE_EMBEDDING_BATCH_SIZE = 50; +const DIRECTORY_STRUCTURE_TYPE = 'directory-structure'; + +function createShortHash(content) { + return createHash('md5').update(content).digest('hex').substring(0, 8); +} + +function createProjectStructureId(projectPath) { + return `__project_structure__#${createShortHash(path.resolve(projectPath))}`; +} + +function buildDocumentChunkId(originalDocumentPath, heading, startLineInDoc) { + return `${originalDocumentPath}#${slugify(heading || 'section')}_${startLineInDoc}`; +} + +function buildExistingDocumentSignature(chunks) { + return chunks + .map((chunk) => `${chunk.id}:${chunk.content_hash}`) + .sort() + .join('|'); +} + // ============================================================================ // FILE PROCESSOR CLASS // ============================================================================ @@ -165,26 +187,43 @@ export class FileProcessor { throw new Error(`[generateDirEmb] Table ${this.fileEmbeddingsTable} not found.`); } - // Create project-specific structure ID based on the root directory const rootDir = options.rootDir || process.cwd(); const projectName = path.basename(path.resolve(rootDir)); - const structureId = `__project_structure__${projectName}`; - - try { - await table.delete(`id = '${structureId}'`); - debug('[generateDirEmb] Deleted existing project structure embedding'); - } catch (error) { - if (!error.message.includes('Record not found') && !error.message.includes('cannot find')) { - debug(`[generateDirEmb] Error deleting existing project structure: ${error.message}`); - } else { - debug('[generateDirEmb] No existing project structure to delete.'); - } - } + const resolvedRootDir = path.resolve(rootDir); + const structureId = createProjectStructureId(resolvedRootDir); const directoryStructure = this.generateDirectoryStructure(options); if (!directoryStructure) throw new Error('[generateDirEmb] Failed to generate directory structure string'); debug('[generateDirEmb] Directory structure string generated.'); + const directoryStructureHash = createShortHash(directoryStructure); + let existingStructureRecords = []; + try { + existingStructureRecords = await table + .query() + .where(`project_path = '${escapeSqlString(resolvedRootDir)}' AND type = '${DIRECTORY_STRUCTURE_TYPE}'`) + .toArray(); + } catch (queryError) { + debug(`[generateDirEmb] Could not query existing project structure embeddings: ${queryError.message}`); + } + + const matchingStructure = existingStructureRecords.find((record) => record.content_hash === directoryStructureHash); + if (matchingStructure) { + debug('[generateDirEmb] Directory structure unchanged, skipping regeneration.'); + return true; + } + + for (const existingRecord of existingStructureRecords) { + try { + await table.delete(`id = '${escapeSqlString(existingRecord.id)}'`); + debug(`[generateDirEmb] Deleted stale project structure embedding: ${existingRecord.id}`); + } catch (error) { + if (!error.message.includes('Record not found') && !error.message.includes('cannot find')) { + debug(`[generateDirEmb] Error deleting existing project structure: ${error.message}`); + } + } + } + // *** Calculate embedding explicitly *** const embedding = await this.modelManager.calculateEmbedding(directoryStructure); @@ -198,13 +237,13 @@ export class FileProcessor { vector: embedding, // Include calculated embedding id: structureId, content: directoryStructure, - type: 'directory-structure', + type: DIRECTORY_STRUCTURE_TYPE, name: `${projectName} Project Structure`, - path: `${projectName} Project Structure`, // Project-specific path - project_path: path.resolve(rootDir), // Add project path for consistency with new schema + path: `${projectName} Project Structure`, + project_path: resolvedRootDir, language: 'text', - content_hash: createHash('md5').update(directoryStructure).digest('hex').substring(0, 8), - last_modified: new Date().toISOString(), // Use current timestamp for directory structure + content_hash: directoryStructureHash, + last_modified: new Date().toISOString(), }; debug(`[generateDirEmb] Prepared record: ID=${record.id}, Vector length=${record.vector?.length}`); @@ -245,7 +284,9 @@ export class FileProcessor { respectGitignore = true, baseDir: optionBaseDir = process.cwd(), maxLines = 1000, - onProgress, // <<< Add onProgress here + batchSize = FILE_EMBEDDING_BATCH_SIZE, + onProgress, + runMode = 'full', } = options; const resolvedCanonicalBaseDir = path.resolve(optionBaseDir); debug(`Resolved canonical base directory: ${resolvedCanonicalBaseDir}`); @@ -280,6 +321,8 @@ export class FileProcessor { this.progressTracker.reset(filePaths.length); verboseLog(options, chalk.blue(`Starting batch processing of ${filePaths.length} files...`)); + const sharedState = await this._createSharedProcessingState(filePaths, resolvedCanonicalBaseDir, exclusionOptions, options); + // Generate directory structure embedding first try { await this.generateDirectoryStructureEmbedding({ @@ -287,13 +330,13 @@ export class FileProcessor { maxDepth: 5, ignorePatterns: excludePatterns, showFiles: true, + verbose: options.verbose, }); } catch (structureError) { console.warn(chalk.yellow(`Warning: Failed to generate directory structure embedding: ${structureError.message}`)); } - const fileTable = await this.databaseManager.getTable(this.fileEmbeddingsTable); - if (!fileTable) { + if (!sharedState.fileTable) { console.error(chalk.red(`Table ${this.fileEmbeddingsTable} not found. Aborting batch file embedding.`)); results.failed = filePaths.length; results.failedFiles = [...filePaths]; @@ -302,33 +345,23 @@ export class FileProcessor { return results; } - // Process files in batches - verboseLog(options, chalk.cyan('--- Starting Phase 1: File Embeddings ---')); - const BATCH_SIZE = 50; // Process files in smaller batches for better performance - - for (let i = 0; i < filePaths.length; i += BATCH_SIZE) { - const batch = filePaths.slice(i, i + BATCH_SIZE); - const batchResults = await this._processBatch( - batch, - resolvedCanonicalBaseDir, - exclusionOptions, - onProgress, - maxLines, - options.verbose - ); + const candidates = await this._prepareCandidateFiles(filePaths, exclusionOptions, sharedState, results, onProgress); - // Merge results - results.processed += batchResults.processed; - results.failed += batchResults.failed; - results.skipped += batchResults.skipped; - results.excluded += batchResults.excluded; - results.files.push(...batchResults.files); - results.failedFiles.push(...batchResults.failedFiles); - results.excludedFiles.push(...batchResults.excludedFiles); - } + verboseLog(options, chalk.cyan('--- Starting Phase 1: File Embeddings ---')); + await this._processFileEmbeddings(candidates, sharedState, { + batchSize, + maxLines, + onProgress, + results, + verbose: options.verbose, + }); // Process document chunks - await this._processDocumentChunks(filePaths, resolvedCanonicalBaseDir, excludePatterns); + await this._processDocumentChunks(candidates, sharedState, options.verbose); + + if (runMode === 'full') { + await this._pruneStaleEmbeddings(sharedState, options.verbose); + } verboseLog(options, chalk.green(`Batch processing complete!`)); @@ -353,69 +386,81 @@ export class FileProcessor { * @returns {Promise} Batch processing results * @private */ - async _processBatch(filePaths, baseDir, exclusionOptions, onProgress, maxLines = 1000, verbose = false) { - const results = { processed: 0, failed: 0, skipped: 0, excluded: 0, files: [], failedFiles: [], excludedFiles: [] }; + async _createSharedProcessingState(filePaths, baseDir, exclusionOptions, options = {}) { + const absoluteFilePaths = filePaths.map((filePath) => + path.isAbsolute(filePath) ? path.resolve(filePath) : path.resolve(baseDir, filePath) + ); + const gitignoreCache = + exclusionOptions.respectGitignore !== false + ? await batchCheckGitignore(absoluteFilePaths, baseDir, { verbose: options.verbose }) + : new Map(); - // ============================================================================ - // PHASE 1: BATCH GITIGNORE CHECK - // ============================================================================ - let gitignoreCache = new Map(); - if (exclusionOptions.respectGitignore !== false) { - verboseLog({}, chalk.cyan(`Performing batch gitignore check for ${filePaths.length} files...`)); - const gitStartTime = Date.now(); - const absoluteFilePaths = filePaths.map((fp) => (path.isAbsolute(fp) ? path.resolve(fp) : path.resolve(baseDir, fp))); - gitignoreCache = await batchCheckGitignore(absoluteFilePaths, baseDir, { verbose }); - const gitDuration = ((Date.now() - gitStartTime) / 1000).toFixed(2); - verboseLog({}, chalk.green(`✓ Batch gitignore check completed in ${gitDuration}s`)); - } - - // ============================================================================ - // PHASE 2: GET EXISTING EMBEDDINGS (for early filtering) - // ============================================================================ - // Query existing embeddings BEFORE reading any files const fileTable = await this.databaseManager.getTable(this.fileEmbeddingsTable); - let existingFilesMap = new Map(); + const documentChunkTable = await this.databaseManager.getTable(this.documentChunkTable); + const existingFilesMap = new Map(); + const existingDocChunksMap = new Map(); + const escapedBaseDir = escapeSqlString(baseDir); - try { - const existingRecords = await fileTable - .query() - .where(`project_path = '${baseDir.replace(/'/g, "''")}'`) - .toArray(); - - for (const record of existingRecords) { - if (!existingFilesMap.has(record.path)) { - existingFilesMap.set(record.path, []); + if (fileTable) { + try { + const existingRecords = await fileTable.query().where(`project_path = '${escapedBaseDir}'`).toArray(); + for (const record of existingRecords) { + if (!existingFilesMap.has(record.path)) { + existingFilesMap.set(record.path, []); + } + existingFilesMap.get(record.path).push(record); } - existingFilesMap.get(record.path).push(record); + verboseLog(options, chalk.cyan(`Found ${existingRecords.length} existing file embeddings for comparison`)); + } catch (queryError) { + console.warn(chalk.yellow(`Warning: Could not query existing embeddings: ${queryError.message}`)); } + } - verboseLog({}, chalk.cyan(`Found ${existingRecords.length} existing embeddings for comparison`)); - } catch (queryError) { - console.warn(chalk.yellow(`Warning: Could not query existing embeddings: ${queryError.message}`)); + if (documentChunkTable) { + try { + const existingChunks = await documentChunkTable.query().where(`project_path = '${escapedBaseDir}'`).toArray(); + for (const chunk of existingChunks) { + if (!existingDocChunksMap.has(chunk.original_document_path)) { + existingDocChunksMap.set(chunk.original_document_path, []); + } + existingDocChunksMap.get(chunk.original_document_path).push(chunk); + } + verboseLog(options, chalk.cyan(`Found ${existingChunks.length} existing document chunks for comparison`)); + } catch (queryError) { + console.warn(chalk.yellow(`Warning: Could not query existing document chunks: ${queryError.message}`)); + } } - // ============================================================================ - // PHASE 3: FAST PRE-FILTERING (without reading file contents) - // ============================================================================ - // Filter files based on basic checks and file timestamps before reading content - const candidateFiles = []; + return { + baseDir, + fileTable, + documentChunkTable, + gitignoreCache, + existingFilesMap, + existingDocChunksMap, + liveFilePaths: new Set(), + liveDocumentPaths: new Set(), + }; + } + + async _prepareCandidateFiles(filePaths, exclusionOptions, sharedState, results, onProgress) { + const candidates = []; for (const filePath of filePaths) { - const absoluteFilePath = path.isAbsolute(filePath) ? path.resolve(filePath) : path.resolve(baseDir, filePath); - const consistentRelativePath = path.relative(baseDir, absoluteFilePath); + const absoluteFilePath = path.isAbsolute(filePath) ? path.resolve(filePath) : path.resolve(sharedState.baseDir, filePath); + const relativePath = path.relative(sharedState.baseDir, absoluteFilePath); try { - // Get file stats (size, mtime, etc.) const stats = fs.statSync(absoluteFilePath); + const language = detectLanguageFromExtension(path.extname(absoluteFilePath)); - // Check if file should be processed (using cached gitignore results) if ( !utilsShouldProcessFile(absoluteFilePath, '', { ...exclusionOptions, - baseDir: baseDir, - relativePathToCheck: consistentRelativePath, - gitignoreCache, // Pass the pre-computed cache - fileStats: stats, // Pass stats to avoid re-reading + baseDir: sharedState.baseDir, + relativePathToCheck: relativePath, + gitignoreCache: sharedState.gitignoreCache, + fileStats: stats, }) ) { results.excluded++; @@ -426,31 +471,14 @@ export class FileProcessor { continue; } - // Early skip based on modification time if file exists in database - const existingRecords = existingFilesMap.get(consistentRelativePath) || []; - let potentiallyUnchanged = false; - - if (existingRecords.length > 0) { - // Check if any existing record has the same modification time - for (const existing of existingRecords) { - const existingMtime = new Date(existing.last_modified).getTime(); - const currentMtime = stats.mtime.getTime(); - - // If modification times match (within 1 second to account for filesystem precision) - if (Math.abs(existingMtime - currentMtime) < 1000) { - potentiallyUnchanged = true; - break; - } - } - } - - candidateFiles.push({ + candidates.push({ filePath: absoluteFilePath, originalInputPath: filePath, - relativePath: consistentRelativePath, + relativePath, stats, - potentiallyUnchanged, - existingRecords, + language, + isDocumentation: isDocumentationFile(absoluteFilePath, language), + existingRecords: sharedState.existingFilesMap.get(relativePath) || [], }); } catch { results.failed++; @@ -461,113 +489,87 @@ export class FileProcessor { } } - verboseLog({}, chalk.cyan(`Pre-filtered to ${candidateFiles.length} candidate files (excluded ${results.excluded})`)); + return candidates; + } - // ============================================================================ - // PHASE 4: READ FILES AND CONTENT HASH CHECK - // ============================================================================ - // Now read file contents only for candidates that passed initial filtering - const filesToProcess = []; - const contentsForBatch = []; + async _processFileEmbeddings(candidates, sharedState, options = {}) { + const { batchSize = FILE_EMBEDDING_BATCH_SIZE, maxLines = 1000, onProgress, results, verbose = false } = options; - for (const fileData of candidateFiles) { - try { - // Read file content - let content = await fs.promises.readFile(fileData.filePath, 'utf8'); + for (let start = 0; start < candidates.length; start += batchSize) { + const batch = candidates.slice(start, start + batchSize); + const filesToActuallyProcess = []; + const contentsToActuallyProcess = []; + const recordsToDelete = new Map(); - // Check if empty - if (content.trim().length === 0) { - results.skipped++; - this.progressTracker.update('skipped'); - if (typeof onProgress === 'function') onProgress('skipped', fileData.originalInputPath); - this.processedFiles.set(fileData.originalInputPath, 'skipped_empty'); - continue; - } + for (const candidate of batch) { + try { + const rawContent = await fs.promises.readFile(candidate.filePath, 'utf8'); + candidate.fullContent = rawContent; - // Truncate content to maximum specified lines for code files only - const isDocFile = isDocumentationFile(fileData.filePath); - if (!isDocFile) { - const lines = content.split('\n'); - if (lines.length > maxLines) { - content = lines.slice(0, maxLines).join('\n') + '\n... (truncated from ' + lines.length + ' lines)'; - debug(`Truncated code file ${fileData.relativePath} from ${lines.length} lines to ${maxLines} lines`); + if (rawContent.trim().length === 0) { + results.skipped++; + this.progressTracker.update('skipped'); + if (typeof onProgress === 'function') onProgress('skipped', candidate.originalInputPath); + this.processedFiles.set(candidate.originalInputPath, 'skipped_empty'); + continue; } - } - // Add content to file data - fileData.content = content; - filesToProcess.push(fileData); - contentsForBatch.push(content); - } catch { - results.failed++; - results.failedFiles.push(fileData.originalInputPath); - this.progressTracker.update('failed'); - if (typeof onProgress === 'function') onProgress('failed', fileData.originalInputPath); - this.processedFiles.set(fileData.originalInputPath, 'failed_read'); - } - } + let embeddingContent = rawContent; + if (!candidate.isDocumentation) { + const lines = rawContent.split('\n'); + if (lines.length > maxLines) { + embeddingContent = lines.slice(0, maxLines).join('\n') + `\n... (truncated from ${lines.length} lines)`; + debug(`Truncated code file ${candidate.relativePath} from ${lines.length} lines to ${maxLines} lines`); + } + } + + candidate.embeddingContent = embeddingContent; + candidate.contentHash = createShortHash(embeddingContent); + sharedState.liveFilePaths.add(candidate.relativePath); - // ============================================================================ - // PHASE 5: CONTENT HASH CHECK AND DEDUPLICATION - // ============================================================================ - // Check each file against existing embeddings using content hash - const filesToActuallyProcess = []; - const contentsToActuallyProcess = []; - const recordsToDelete = []; - - for (let i = 0; i < filesToProcess.length; i++) { - const fileData = filesToProcess[i]; - const contentHash = createHash('md5').update(fileData.content).digest('hex').substring(0, 8); - - const existingRecords = fileData.existingRecords || []; - let needsUpdate = true; - - if (existingRecords.length > 0) { - // Check if any existing record matches our current file state - for (const existing of existingRecords) { - if (existing.content_hash === contentHash) { - // File content hasn't changed - skip processing (CI-friendly) - // Note: We rely on content_hash rather than last_modified because - // GitHub Actions checkout changes file timestamps even for unchanged files - needsUpdate = false; + const unchangedRecord = candidate.existingRecords.find((record) => record.content_hash === candidate.contentHash); + if (unchangedRecord) { results.skipped++; this.progressTracker.update('skipped'); - if (typeof onProgress === 'function') onProgress('skipped', fileData.originalInputPath); - this.processedFiles.set(fileData.originalInputPath, 'skipped_unchanged'); - debug(`Skipping unchanged file: ${fileData.relativePath} (hash: ${contentHash})`); - break; - } else if (existing.path === fileData.relativePath) { - // Same file path but different content - mark old version for deletion - recordsToDelete.push(existing); + if (typeof onProgress === 'function') onProgress('skipped', candidate.originalInputPath); + this.processedFiles.set(candidate.originalInputPath, 'skipped_unchanged'); + debug(`Skipping unchanged file: ${candidate.relativePath} (hash: ${candidate.contentHash})`); + continue; } - } - } - if (needsUpdate) { - // File needs processing (new or changed) - filesToActuallyProcess.push(fileData); - contentsToActuallyProcess.push(fileData.content); + for (const existingRecord of candidate.existingRecords) { + recordsToDelete.set(existingRecord.id, existingRecord); + } + + filesToActuallyProcess.push(candidate); + contentsToActuallyProcess.push(candidate.embeddingContent); + } catch { + candidate.readError = true; + results.failed++; + results.failedFiles.push(candidate.originalInputPath); + this.progressTracker.update('failed'); + if (typeof onProgress === 'function') onProgress('failed', candidate.originalInputPath); + this.processedFiles.set(candidate.originalInputPath, 'failed_read'); + } } - } - // Batch delete old versions if any - if (recordsToDelete.length > 0) { - for (const recordToDelete of recordsToDelete) { + for (const recordToDelete of recordsToDelete.values()) { try { - await fileTable.delete(`id = '${recordToDelete.id.replace(/'/g, "''")}'`); + await sharedState.fileTable.delete(`id = '${escapeSqlString(recordToDelete.id)}'`); debug(`Deleted old version: ${recordToDelete.path} (old hash: ${recordToDelete.content_hash})`); } catch (deleteError) { console.warn(chalk.yellow(`Warning: Could not delete old version of ${recordToDelete.path}: ${deleteError.message}`)); } } - } - // Generate embeddings only for files that need processing - if (filesToActuallyProcess.length > 0) { + if (filesToActuallyProcess.length === 0) { + continue; + } + verboseLog( - {}, + { verbose }, chalk.cyan( - `Processing ${filesToActuallyProcess.length} new/changed files (skipped ${filesToProcess.length - filesToActuallyProcess.length} unchanged)` + `Processing ${filesToActuallyProcess.length} new/changed files in batch (${Math.min(start + batch.length, candidates.length)}/${candidates.length})` ) ); @@ -576,42 +578,37 @@ export class FileProcessor { const recordsToAdd = []; for (let i = 0; i < embeddings.length; i++) { - const fileData = filesToActuallyProcess[i]; + const candidate = filesToActuallyProcess[i]; const embeddingVector = embeddings[i]; - if (embeddingVector) { - const contentHash = createHash('md5').update(fileData.content).digest('hex').substring(0, 8); - const fileId = `${fileData.relativePath}#${contentHash}`; - - const record = { - vector: embeddingVector, - id: fileId, - content: fileData.content, - type: 'file', - name: path.basename(fileData.filePath), - path: fileData.relativePath, - project_path: baseDir, - language: detectLanguageFromExtension(path.extname(fileData.filePath)), - content_hash: contentHash, - last_modified: fileData.stats.mtime.toISOString(), - }; - recordsToAdd.push(record); - } else { + if (!embeddingVector) { results.failed++; - results.failedFiles.push(fileData.originalInputPath); + results.failedFiles.push(candidate.originalInputPath); this.progressTracker.update('failed'); - if (typeof onProgress === 'function') onProgress('failed', fileData.originalInputPath); - this.processedFiles.set(fileData.originalInputPath, 'failed_embedding'); + if (typeof onProgress === 'function') onProgress('failed', candidate.originalInputPath); + this.processedFiles.set(candidate.originalInputPath, 'failed_embedding'); + continue; } + + recordsToAdd.push({ + vector: embeddingVector, + id: `${candidate.relativePath}#${candidate.contentHash}`, + content: candidate.embeddingContent, + type: 'file', + name: path.basename(candidate.filePath), + path: candidate.relativePath, + project_path: sharedState.baseDir, + language: candidate.language, + content_hash: candidate.contentHash, + last_modified: candidate.stats.mtime.toISOString(), + }); } - // Add new/updated records to database if (recordsToAdd.length > 0) { - await fileTable.add(recordsToAdd); + await sharedState.fileTable.add(recordsToAdd); - // Optimize table to sync indices with data and prevent TakeExec panics try { - await fileTable.optimize(); + await sharedState.fileTable.optimize(); } catch (optimizeError) { if (optimizeError.message && optimizeError.message.includes('legacy format')) { console.warn( @@ -624,30 +621,25 @@ export class FileProcessor { } } - recordsToAdd.forEach((record, index) => { - const fileData = filesToActuallyProcess[index]; - if (embeddings[index]) { - results.processed++; - results.files.push(fileData.originalInputPath); - this.progressTracker.update('processed'); - if (typeof onProgress === 'function') onProgress('processed', fileData.originalInputPath); - this.processedFiles.set(fileData.originalInputPath, 'processed'); - } - }); + for (const candidate of filesToActuallyProcess) { + results.processed++; + results.files.push(candidate.originalInputPath); + this.progressTracker.update('processed'); + if (typeof onProgress === 'function') onProgress('processed', candidate.originalInputPath); + this.processedFiles.set(candidate.originalInputPath, 'processed'); + } } } catch (error) { console.error(chalk.red(`Error processing batch: ${error.message}`)); - filesToProcess.forEach((fileData) => { + for (const candidate of filesToActuallyProcess) { results.failed++; - results.failedFiles.push(fileData.originalInputPath); + results.failedFiles.push(candidate.originalInputPath); this.progressTracker.update('failed'); - if (typeof onProgress === 'function') onProgress('failed', fileData.originalInputPath); - this.processedFiles.set(fileData.originalInputPath, 'failed_batch'); - }); + if (typeof onProgress === 'function') onProgress('failed', candidate.originalInputPath); + this.processedFiles.set(candidate.originalInputPath, 'failed_batch'); + } } } - - return results; } /** @@ -658,115 +650,81 @@ export class FileProcessor { * @returns {Promise} * @private */ - async _processDocumentChunks(filePaths, baseDir) { - verboseLog({}, chalk.cyan('--- Starting Phase 2: Document Chunk Embeddings ---')); - const documentChunkTable = await this.databaseManager.getTable(this.documentChunkTable); - if (!documentChunkTable) { + async _processDocumentChunks(candidates, sharedState, verbose = false) { + verboseLog({ verbose }, chalk.cyan('--- Starting Phase 2: Document Chunk Embeddings ---')); + if (!sharedState.documentChunkTable) { console.warn(chalk.yellow(`Skipping Phase 2: Document Chunk Embeddings because table ${this.documentChunkTable} was not found.`)); return; } - // Efficient batch check: Get all existing document chunks for this project - let existingDocChunksMap = new Map(); - try { - const existingChunks = await documentChunkTable - .query() - .where(`project_path = '${baseDir.replace(/'/g, "''")}'`) - .toArray(); - - // Build a map for fast lookup: original_document_path -> [chunks] - for (const chunk of existingChunks) { - if (!existingDocChunksMap.has(chunk.original_document_path)) { - existingDocChunksMap.set(chunk.original_document_path, []); - } - existingDocChunksMap.get(chunk.original_document_path).push(chunk); - } - - verboseLog({}, chalk.cyan(`Found ${existingChunks.length} existing document chunks for comparison`)); - } catch (queryError) { - console.warn(chalk.yellow(`Warning: Could not query existing document chunks, will process all docs: ${queryError.message}`)); - existingDocChunksMap = new Map(); - } - const allDocChunksToEmbed = []; const allDocChunkRecordsToAdd = []; const processedDocPathsForDeletion = new Set(); let skippedDocCount = 0; - for (const filePath of filePaths) { - const absoluteFilePath = path.isAbsolute(filePath) ? path.resolve(filePath) : path.resolve(baseDir, filePath); - const consistentRelativePath = path.relative(baseDir, absoluteFilePath); - const language = detectLanguageFromExtension(path.extname(absoluteFilePath)); - - if (isDocumentationFile(absoluteFilePath, language)) { - try { - const stats = fs.statSync(absoluteFilePath); - if (stats.size > 5 * 1024 * 1024) { - // 5MB limit for docs - continue; - } - - const content = await fs.promises.readFile(absoluteFilePath, 'utf8'); - if (content.trim().length === 0) { - continue; - } - - // Check if document has changed by comparing chunk content hashes - const existingChunks = existingDocChunksMap.get(consistentRelativePath) || []; + for (const candidate of candidates) { + if (!candidate.isDocumentation || candidate.readError) { + continue; + } - // Extract chunks to compare with existing ones - const { chunks: currentChunks, documentH1 } = extractMarkdownChunks(absoluteFilePath, content, consistentRelativePath); - let hasUnchangedDocument = false; + try { + if (candidate.stats.size > 5 * 1024 * 1024) { + continue; + } - if (existingChunks.length > 0 && currentChunks.length === existingChunks.length) { - // Create a signature of the document by combining all chunk content hashes - const currentChunkHashes = currentChunks - .map((chunk) => createHash('md5').update(chunk.content).digest('hex').substring(0, 8)) - .sort() - .join('|'); + if (!candidate.fullContent || candidate.fullContent.trim().length === 0) { + continue; + } - const existingChunkHashes = existingChunks - .map((chunk) => chunk.content_hash) - .sort() - .join('|'); + const existingChunks = sharedState.existingDocChunksMap.get(candidate.relativePath) || []; + const { chunks: currentChunks, documentH1 } = extractMarkdownChunks( + candidate.filePath, + candidate.fullContent, + candidate.relativePath + ); - hasUnchangedDocument = currentChunkHashes === existingChunkHashes; - } + const currentSignature = currentChunks + .map( + (chunk) => + `${buildDocumentChunkId(candidate.relativePath, chunk.heading, chunk.start_line_in_doc)}:${createShortHash(chunk.content)}` + ) + .sort() + .join('|'); + const existingSignature = buildExistingDocumentSignature(existingChunks); + + if (existingChunks.length > 0 && currentSignature === existingSignature) { + sharedState.liveDocumentPaths.add(candidate.relativePath); + skippedDocCount++; + debug(`Skipping unchanged document: ${candidate.relativePath} (${currentChunks.length} chunks match)`); + continue; + } - if (hasUnchangedDocument) { - // Document hasn't changed - skip processing - skippedDocCount++; - debug(`Skipping unchanged document: ${consistentRelativePath} (${currentChunks.length} chunks match)`); - continue; - } + if (currentChunks.length === 0) { + continue; + } - // Document has changed or is new - process it - if (!processedDocPathsForDeletion.has(consistentRelativePath)) { - processedDocPathsForDeletion.add(consistentRelativePath); - } + sharedState.liveDocumentPaths.add(candidate.relativePath); + processedDocPathsForDeletion.add(candidate.relativePath); - if (currentChunks.length > 0) { - currentChunks.forEach((chunk) => { - const chunkWithTitle = { - ...chunk, - documentTitle: documentH1 || path.basename(absoluteFilePath, path.extname(absoluteFilePath)), - fileStats: stats, - }; - allDocChunksToEmbed.push(chunkWithTitle); - }); - } - } catch (docError) { - console.warn(chalk.yellow(`Error processing document ${consistentRelativePath} for chunking: ${docError.message}`)); + for (const chunk of currentChunks) { + allDocChunksToEmbed.push({ + ...chunk, + documentTitle: documentH1 || path.basename(candidate.filePath, path.extname(candidate.filePath)), + fileStats: candidate.stats, + original_document_path: candidate.relativePath, + }); } + } catch (docError) { + console.warn(chalk.yellow(`Error processing document ${candidate.relativePath} for chunking: ${docError.message}`)); } } if (skippedDocCount > 0) { - verboseLog({}, chalk.cyan(`Skipped ${skippedDocCount} unchanged documentation files`)); + verboseLog({ verbose }, chalk.cyan(`Skipped ${skippedDocCount} unchanged documentation files`)); } if (allDocChunksToEmbed.length > 0) { - verboseLog({}, chalk.blue(`Extracted ${allDocChunksToEmbed.length} total document chunks to process for embeddings.`)); + verboseLog({ verbose }, chalk.blue(`Extracted ${allDocChunksToEmbed.length} total document chunks to process for embeddings.`)); const chunkContentsForBatching = allDocChunksToEmbed.map((chunk) => chunk.content); const chunkEmbeddings = await this.modelManager.calculateEmbeddingBatch(chunkContentsForBatching); @@ -775,14 +733,14 @@ export class FileProcessor { const chunkEmbeddingVector = chunkEmbeddings[i]; if (chunkEmbeddingVector) { - const chunkContentHash = createHash('md5').update(chunkData.content).digest('hex').substring(0, 8); - const chunkId = `${chunkData.original_document_path}#${slugify(chunkData.heading || 'section')}_${chunkData.start_line_in_doc}`; + const chunkContentHash = createShortHash(chunkData.content); + const chunkId = buildDocumentChunkId(chunkData.original_document_path, chunkData.heading, chunkData.start_line_in_doc); const record = { id: chunkId, content: chunkData.content, original_document_path: chunkData.original_document_path, - project_path: baseDir, + project_path: sharedState.baseDir, heading_text: chunkData.heading || '', document_title: chunkData.documentTitle, language: chunkData.language || 'markdown', @@ -799,7 +757,9 @@ export class FileProcessor { if (processedDocPathsForDeletion.size > 0) { for (const docPathToDelete of processedDocPathsForDeletion) { try { - await documentChunkTable.delete(`original_document_path = '${docPathToDelete.replace(/'/g, "''")}'`); + await sharedState.documentChunkTable.delete( + `project_path = '${escapeSqlString(sharedState.baseDir)}' AND original_document_path = '${escapeSqlString(docPathToDelete)}'` + ); } catch (deleteError) { console.warn(chalk.yellow(`Error deleting chunks for document ${docPathToDelete}: ${deleteError.message}`)); } @@ -808,11 +768,11 @@ export class FileProcessor { if (allDocChunkRecordsToAdd.length > 0) { try { - await documentChunkTable.add(allDocChunkRecordsToAdd); + await sharedState.documentChunkTable.add(allDocChunkRecordsToAdd); // Optimize table to sync indices with data and prevent TakeExec panics try { - await documentChunkTable.optimize(); + await sharedState.documentChunkTable.optimize(); } catch (optimizeError) { if (optimizeError.message && optimizeError.message.includes('legacy format')) { console.warn(chalk.yellow(`Skipping optimization due to legacy index format - will be auto-upgraded during normal operations`)); @@ -822,7 +782,7 @@ export class FileProcessor { } verboseLog( - {}, + { verbose }, chalk.green(`Successfully added ${allDocChunkRecordsToAdd.length} document chunk embeddings to ${this.documentChunkTable}.`) ); } catch (addError) { @@ -830,7 +790,19 @@ export class FileProcessor { } } - verboseLog({}, chalk.green('--- Finished Phase 2: Document Chunk Embeddings ---')); + verboseLog({ verbose }, chalk.green('--- Finished Phase 2: Document Chunk Embeddings ---')); + } + + async _pruneStaleEmbeddings(sharedState, verbose = false) { + try { + const [prunedFiles, prunedDocs] = await Promise.all([ + this.databaseManager.pruneProjectFileEmbeddings(sharedState.baseDir, sharedState.liveFilePaths), + this.databaseManager.pruneProjectDocumentChunks(sharedState.baseDir, sharedState.liveDocumentPaths), + ]); + verboseLog({ verbose }, chalk.cyan(`Pruned ${prunedFiles} stale file embeddings and ${prunedDocs} stale document chunk embeddings.`)); + } catch (error) { + console.warn(chalk.yellow(`Warning: Failed to prune stale embeddings: ${error.message}`)); + } } // ============================================================================ diff --git a/src/embeddings/file-processor.test.js b/src/embeddings/file-processor.test.js index 0d06ad7..a051632 100644 --- a/src/embeddings/file-processor.test.js +++ b/src/embeddings/file-processor.test.js @@ -174,7 +174,11 @@ describe('FileProcessor', () => { }); it('should delete existing structure embedding before adding new one', async () => { - fs.readdirSync.mockReturnValue([]); + fs.readdirSync.mockReturnValue([createDirEntry('file.js')]); + mockTable.query.mockReturnValue({ + where: vi.fn().mockReturnThis(), + toArray: vi.fn().mockResolvedValue([{ id: 'existing-structure', content_hash: 'different' }]), + }); await processor.generateDirectoryStructureEmbedding(); expect(mockTable.delete).toHaveBeenCalled(); }); @@ -281,7 +285,7 @@ describe('FileProcessor', () => { }); it('should handle file table not found', async () => { - mockDatabaseManager.getTable.mockResolvedValueOnce(mockTable).mockResolvedValueOnce(null); + mockDatabaseManager.getTable.mockResolvedValue(null); fs.readdirSync.mockReturnValue([]); const result = await processor.processBatchEmbeddings(['/test/file.js']); expect(result.failed).toBe(1); @@ -371,7 +375,19 @@ describe('FileProcessor', () => { toArray: vi.fn().mockResolvedValue([{ path: 'file.js', content_hash: 'abc12345', last_modified: new Date().toISOString() }]), }); const result = await processor.processBatchEmbeddings(['/test/file.js'], { baseDir: '/test' }); - expect(result.skipped).toBeGreaterThanOrEqual(0); + expect(result.skipped).toBeGreaterThan(0); + expect(mockModelManager.calculateEmbeddingBatch).not.toHaveBeenCalled(); + }); + + it('should ignore CI-style mtime churn when content hash is unchanged', async () => { + setupFileSystemMocks(); + mockTable.query.mockReturnValue({ + where: vi.fn().mockReturnThis(), + toArray: vi.fn().mockResolvedValue([{ path: 'file.js', content_hash: 'abc12345', last_modified: '2000-01-01T00:00:00.000Z' }]), + }); + const result = await processor.processBatchEmbeddings(['/test/file.js'], { baseDir: '/test' }); + expect(result.skipped).toBeGreaterThan(0); + expect(mockModelManager.calculateEmbeddingBatch).not.toHaveBeenCalled(); }); it('should delete old version when content hash differs', async () => { @@ -469,7 +485,9 @@ describe('FileProcessor', () => { documentH1: 'Document Title', }); setupFileSystemMocks('# Title\n\n## Section 1\nContent'); - mockModelManager.calculateEmbeddingBatch.mockResolvedValue([createMockEmbedding(), createMockEmbedding()]); + mockModelManager.calculateEmbeddingBatch + .mockResolvedValueOnce([createMockEmbedding()]) + .mockResolvedValueOnce([createMockEmbedding(), createMockEmbedding()]); expect(await processor.processBatchEmbeddings(['/test/doc.md'], { baseDir: '/test' })).toBeDefined(); }); @@ -481,10 +499,13 @@ describe('FileProcessor', () => { const docMockTable = createMockTable({ query: vi.fn().mockReturnValue({ where: vi.fn().mockReturnThis(), - toArray: vi.fn().mockResolvedValue([{ original_document_path: 'doc.md', content_hash: 'abc12345' }]), + toArray: vi.fn().mockResolvedValue([{ id: 'doc.md#section-1_1', original_document_path: 'doc.md', content_hash: 'abc12345' }]), }), }); - mockDatabaseManager.getTable.mockResolvedValueOnce(mockTable).mockResolvedValueOnce(mockTable).mockResolvedValueOnce(docMockTable); + mockDatabaseManager.getTable.mockImplementation((tableName) => { + if (tableName === 'document_chunk_embeddings') return Promise.resolve(docMockTable); + return Promise.resolve(mockTable); + }); setupFileSystemMocks('# Title\n\nChunk 1'); expect(await processor.processBatchEmbeddings(['/test/doc.md'], { baseDir: '/test' })).toBeDefined(); }); @@ -506,13 +527,14 @@ describe('FileProcessor', () => { documentH1: 'Title', }); const docMockTable = createMockTable(); - let callCount = 0; - mockDatabaseManager.getTable.mockImplementation(() => { - callCount++; - return Promise.resolve(callCount === 3 ? docMockTable : mockTable); + mockDatabaseManager.getTable.mockImplementation((tableName) => { + if (tableName === 'document_chunk_embeddings') return Promise.resolve(docMockTable); + return Promise.resolve(mockTable); }); setupFileSystemMocks('# Title\n\nContent'); - mockModelManager.calculateEmbeddingBatch.mockResolvedValue([createMockEmbedding()]); + mockModelManager.calculateEmbeddingBatch + .mockResolvedValueOnce([createMockEmbedding()]) + .mockResolvedValueOnce([createMockEmbedding()]); await processor.processBatchEmbeddings(['/test/doc.md'], { baseDir: '/test' }); expect(docMockTable.add).toHaveBeenCalled(); }); @@ -523,13 +545,14 @@ describe('FileProcessor', () => { documentH1: 'Title', }); const docMockTable = createMockTable({ optimize: vi.fn().mockRejectedValue(new Error('legacy format')) }); - let callCount = 0; - mockDatabaseManager.getTable.mockImplementation(() => { - callCount++; - return Promise.resolve(callCount === 3 ? docMockTable : mockTable); + mockDatabaseManager.getTable.mockImplementation((tableName) => { + if (tableName === 'document_chunk_embeddings') return Promise.resolve(docMockTable); + return Promise.resolve(mockTable); }); setupFileSystemMocks('# Title\n\nContent'); - mockModelManager.calculateEmbeddingBatch.mockResolvedValue([createMockEmbedding()]); + mockModelManager.calculateEmbeddingBatch + .mockResolvedValueOnce([createMockEmbedding()]) + .mockResolvedValueOnce([createMockEmbedding()]); await processor.processBatchEmbeddings(['/test/doc.md'], { baseDir: '/test' }); expect(console.warn).toHaveBeenCalledWith(expect.stringContaining('legacy index format')); }); @@ -542,6 +565,27 @@ describe('FileProcessor', () => { fs.promises.readFile.mockResolvedValue('# Title'); expect(await processor.processBatchEmbeddings(['/test/error.md'], { baseDir: '/test' })).toBeDefined(); }); + + it('should skip doc chunk generation for excluded markdown files', async () => { + shouldProcessFile.mockReturnValue(false); + setupFileSystemMocks('# Title'); + await processor.processBatchEmbeddings(['/test/doc.md'], { baseDir: '/test' }); + expect(extractMarkdownChunks).not.toHaveBeenCalled(); + }); + + it('should prune stale embeddings on full scans', async () => { + setupFileSystemMocks('# Title'); + await processor.processBatchEmbeddings(['/test/doc.md'], { baseDir: '/test', runMode: 'full' }); + expect(mockDatabaseManager.pruneProjectFileEmbeddings).toHaveBeenCalledWith('/test', expect.any(Set)); + expect(mockDatabaseManager.pruneProjectDocumentChunks).toHaveBeenCalledWith('/test', expect.any(Set)); + }); + + it('should not prune stale embeddings on partial runs', async () => { + setupFileSystemMocks('# Title'); + await processor.processBatchEmbeddings(['/test/doc.md'], { baseDir: '/test', runMode: 'partial' }); + expect(mockDatabaseManager.pruneProjectFileEmbeddings).not.toHaveBeenCalled(); + expect(mockDatabaseManager.pruneProjectDocumentChunks).not.toHaveBeenCalled(); + }); }); // ========================================================================== diff --git a/src/index.js b/src/index.js index 90b032b..838d6e3 100755 --- a/src/index.js +++ b/src/index.js @@ -519,6 +519,7 @@ async function generateEmbeddings(options) { // Get files to process let filesToProcess = []; + const runMode = options.files && options.files.length > 0 ? 'partial' : 'full'; if (options.files && options.files.length > 0) { console.log(chalk.cyan('Processing specified files/patterns...')); @@ -585,6 +586,7 @@ async function generateEmbeddings(options) { baseDir: baseDir, batchSize: 100, // Set a reasonable batch size maxLines: parseInt(options.maxLines || '1000', 10), + runMode, onProgress: (status) => { // Update counters based on status if (status === 'processed') { @@ -625,9 +627,6 @@ async function generateEmbeddings(options) { forceAnalysis: options.forceAnalysis, }); - // Store project summary in embeddings system for later use - await embeddingsSystem.storeProjectSummary(projectDir, projectSummary); - console.log(chalk.green('✅ Project analysis complete and stored')); verboseLog(options, chalk.gray(` Project: ${projectSummary.projectName}`)); verboseLog( diff --git a/src/project-analyzer.js b/src/project-analyzer.js index a286941..bce2ddf 100644 --- a/src/project-analyzer.js +++ b/src/project-analyzer.js @@ -209,12 +209,17 @@ export class ProjectAnalyzer { // Check for existing analysis const existingSummary = forceAnalysis ? null : await this.loadExistingAnalysis(projectPath); if (existingSummary && !forceAnalysis) { - const currentHash = await this.calculateKeyFilesHash(existingSummary.keyFiles); - if (existingSummary.keyFilesHash === currentHash) { - verboseLog(verbose, chalk.green('✅ Project analysis up-to-date (no key file changes detected)')); - return existingSummary; + const currentEmbeddingInventoryHash = await this.calculateEmbeddingInventoryHash(projectPath); + if (existingSummary.embeddingInventoryHash !== currentEmbeddingInventoryHash) { + verboseLog(verbose, chalk.yellow('🔄 Embedding inventory changed, regenerating analysis...')); + } else { + const currentHash = await this.calculateKeyFilesHash(existingSummary.keyFiles); + if (existingSummary.keyFilesHash === currentHash) { + verboseLog(verbose, chalk.green('✅ Project analysis up-to-date (no key file changes detected)')); + return existingSummary; + } + verboseLog(verbose, chalk.yellow('🔄 Key files changed, regenerating analysis...')); } - verboseLog(verbose, chalk.yellow('🔄 Key files changed, regenerating analysis...')); } else { verboseLog( verbose, @@ -241,6 +246,7 @@ export class ProjectAnalyzer { const currentHash = await this.calculateKeyFilesHash(keyFiles); projectSummary.keyFiles = keyFiles; projectSummary.keyFilesHash = currentHash; + projectSummary.embeddingInventoryHash = await this.calculateEmbeddingInventoryHash(projectPath); await this.storeAnalysis(projectPath, projectSummary); @@ -613,6 +619,39 @@ Select files following the criteria in the system instructions.`; return hash.digest('hex'); } + /** + * Calculate a project-scoped hash of the current embedding inventory. + */ + async calculateEmbeddingInventoryHash(projectPath) { + try { + const embeddingsSystem = getDefaultEmbeddingsSystem(); + await embeddingsSystem.initialize(); + const table = await embeddingsSystem.databaseManager.getTable(embeddingsSystem.databaseManager.fileEmbeddingsTable); + + if (!table) { + return 'no-file-embeddings-table'; + } + + const records = await table + .query() + .select(['type', 'path', 'content_hash', 'project_path']) + .where(`project_path = '${projectPath.replace(/'/g, "''")}'`) + .toArray(); + + const hash = crypto.createHash('sha256'); + const normalizedRows = records.map((record) => `${record.type || 'file'}:${record.path || ''}:${record.content_hash || ''}`).sort(); + + for (const row of normalizedRows) { + hash.update(row); + } + + return hash.digest('hex'); + } catch (error) { + verboseLog({}, chalk.yellow(`Warning: Could not calculate embedding inventory hash: ${error.message}`)); + return 'embedding-inventory-unavailable'; + } + } + /** * Generate comprehensive project summary using LLM analysis (SINGLE CALL) */ diff --git a/src/rag-analyzer.test.js b/src/rag-analyzer.test.js index 929ba83..1696409 100644 --- a/src/rag-analyzer.test.js +++ b/src/rag-analyzer.test.js @@ -910,6 +910,15 @@ describe('rag-analyzer', () => { expect(context).toHaveProperty('codeExamples'); }); + it('should degrade gracefully when retrieval returns no context for active files', async () => { + mockEmbeddingsSystem.findSimilarCode.mockResolvedValue([]); + mockEmbeddingsSystem.findRelevantDocs.mockResolvedValue([]); + const prFiles = [{ filePath: '/src/file.js', content: 'code', language: 'javascript' }]; + const context = await gatherUnifiedContextForPR(prFiles, { projectPath: '/project' }); + expect(context.codeExamples).toEqual([]); + expect(context.guidelines).toEqual([]); + }); + it('should find custom document chunks', async () => { mockEmbeddingsSystem.getExistingCustomDocumentChunks.mockResolvedValue([{ content: 'Custom doc', document_title: 'Guidelines' }]); const prFiles = [{ filePath: '/src/file.js', content: 'code', language: 'javascript' }]; diff --git a/src/test-utils/fixtures.js b/src/test-utils/fixtures.js index 9486f80..d3433ea 100644 --- a/src/test-utils/fixtures.js +++ b/src/test-utils/fixtures.js @@ -73,6 +73,8 @@ export function createMockDatabaseManager(mockTable = null) { // Embeddings operations clearAllEmbeddings: vi.fn().mockResolvedValue(true), clearProjectEmbeddings: vi.fn().mockResolvedValue(true), + pruneProjectFileEmbeddings: vi.fn().mockResolvedValue(0), + pruneProjectDocumentChunks: vi.fn().mockResolvedValue(0), // Project summary storeProjectSummary: vi.fn().mockResolvedValue(undefined), getProjectSummary: vi.fn().mockResolvedValue(null), diff --git a/src/utils/path-utils.js b/src/utils/path-utils.js new file mode 100644 index 0000000..83e5a16 --- /dev/null +++ b/src/utils/path-utils.js @@ -0,0 +1,13 @@ +import path from 'node:path'; + +/** + * Check whether an absolute path is contained within a project root. + * + * @param {string} absolutePath - Absolute path to validate + * @param {string} projectPath - Project root path + * @returns {boolean} True when the path is inside the project + */ +export function isPathWithinProject(absolutePath, projectPath) { + const relativePath = path.relative(projectPath, absolutePath); + return relativePath === '' || (!relativePath.startsWith('..') && !path.isAbsolute(relativePath)); +} diff --git a/src/utils/path-utils.test.js b/src/utils/path-utils.test.js new file mode 100644 index 0000000..8e58e0d --- /dev/null +++ b/src/utils/path-utils.test.js @@ -0,0 +1,19 @@ +import { isPathWithinProject } from './path-utils.js'; + +describe('isPathWithinProject', () => { + it('should accept a path inside the project', () => { + expect(isPathWithinProject('/repo/src/file.js', '/repo')).toBe(true); + }); + + it('should accept the project root itself', () => { + expect(isPathWithinProject('/repo', '/repo')).toBe(true); + }); + + it('should reject sibling project prefix collisions', () => { + expect(isPathWithinProject('/repo-old/src/file.js', '/repo')).toBe(false); + }); + + it('should reject paths outside the project', () => { + expect(isPathWithinProject('/other/file.js', '/repo')).toBe(false); + }); +}); diff --git a/src/utils/string-utils.js b/src/utils/string-utils.js index 950acd2..aacb36d 100644 --- a/src/utils/string-utils.js +++ b/src/utils/string-utils.js @@ -51,3 +51,13 @@ export function addLineNumbers(content) { const padding = String(lines.length).length; return lines.map((line, i) => `${String(i + 1).padStart(padding)} | ${line}`).join('\n'); } + +/** + * Escape single quotes for safe use in SQL string literals. + * + * @param {string} value - Value to escape + * @returns {string} Escaped value + */ +export function escapeSqlString(value) { + return String(value).replace(/'/g, "''"); +} diff --git a/src/utils/string-utils.test.js b/src/utils/string-utils.test.js index d020834..44d8690 100644 --- a/src/utils/string-utils.test.js +++ b/src/utils/string-utils.test.js @@ -1,4 +1,4 @@ -import { slugify, addLineNumbers } from './string-utils.js'; +import { slugify, addLineNumbers, escapeSqlString } from './string-utils.js'; describe('slugify', () => { describe('basic transformations', () => { @@ -153,3 +153,17 @@ describe('addLineNumbers', () => { }); }); }); + +describe('escapeSqlString', () => { + it('should escape single quotes', () => { + expect(escapeSqlString("it's fine")).toBe("it''s fine"); + }); + + it('should coerce non-string values', () => { + expect(escapeSqlString(42)).toBe('42'); + }); + + it('should leave strings without single quotes unchanged', () => { + expect(escapeSqlString('plain text')).toBe('plain text'); + }); +});