From 61785f7bd09aa8d0bce043f09850a454b1c73a54 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 25 Feb 2026 02:25:33 -0700 Subject: [PATCH 1/3] feat: add git co-change analysis (backlog #9) Analyze git history to surface files that historically change together, using Jaccard similarity coefficients. Results are stored in the DB and integrated into diff-impact to catch temporal coupling the static graph misses. - New src/cochange.js module with scan, compute, analyze, and query functions - DB migration v5: co_changes + co_change_meta tables - CLI: `codegraph co-change [file]` with --analyze, --since, --min-support, etc. - diff-impact now shows historicallyCoupled files when co-change data exists - MCP: new co_changes tool for AI agent access - 19 new tests covering pure logic, DB integration, and real git repos Impact: 13 functions changed, 9 affected --- src/cli.js | 62 ++++ src/cochange.js | 469 +++++++++++++++++++++++++++++ src/config.js | 6 + src/db.js | 21 ++ src/index.js | 9 + src/mcp.js | 36 +++ src/queries.js | 52 +++- tests/integration/cochange.test.js | 283 +++++++++++++++++ tests/unit/mcp.test.js | 1 + 9 files changed, 936 insertions(+), 3 deletions(-) create mode 100644 src/cochange.js create mode 100644 tests/integration/cochange.test.js diff --git a/src/cli.js b/src/cli.js index 60dbac63..f85d3d6d 100644 --- a/src/cli.js +++ b/src/cli.js @@ -562,6 +562,68 @@ program }); }); +program + .command('co-change [file]') + .description( + 'Analyze git history for files that change together. Use --analyze to scan, or query existing data.', + ) + .option('--analyze', 'Scan git history and populate co-change data') + .option('--since ', 'Git date for history window (default: "1 year ago")') + .option('--min-support ', 'Minimum co-occurrence count (default: 3)') + .option('--min-jaccard ', 'Minimum Jaccard similarity 0-1 (default: 0.3)') + .option('--full', 'Force full re-scan (ignore incremental state)') + .option('-n, --limit ', 'Max results', '20') + .option('-d, --db ', 'Path to graph.db') + .option('-T, --no-tests', 'Exclude test/spec files') + .option('--include-tests', 'Include test/spec files (overrides excludeTests config)') + .option('-j, --json', 'Output as JSON') + .action(async (file, opts) => { + const { analyzeCoChanges, coChangeData, coChangeTopData, formatCoChange, formatCoChangeTop } = + await import('./cochange.js'); + + if (opts.analyze) { + const result = analyzeCoChanges(opts.db, { + since: opts.since || config.coChange?.since, + minSupport: opts.minSupport ? parseInt(opts.minSupport, 10) : config.coChange?.minSupport, + maxFilesPerCommit: config.coChange?.maxFilesPerCommit, + full: opts.full, + }); + if (opts.json) { + console.log(JSON.stringify(result, null, 2)); + } else if (result.error) { + console.error(result.error); + process.exit(1); + } else { + console.log( + `\nCo-change analysis complete: ${result.pairsFound} pairs from ${result.commitsScanned} commits (since: ${result.since})\n`, + ); + } + return; + } + + const queryOpts = { + limit: parseInt(opts.limit, 10), + minJaccard: opts.minJaccard ? parseFloat(opts.minJaccard) : config.coChange?.minJaccard, + noTests: resolveNoTests(opts), + }; + + if (file) { + const data = coChangeData(file, opts.db, queryOpts); + if (opts.json) { + console.log(JSON.stringify(data, null, 2)); + } else { + console.log(formatCoChange(data)); + } + } else { + const data = coChangeTopData(opts.db, queryOpts); + if (opts.json) { + console.log(JSON.stringify(data, null, 2)); + } else { + console.log(formatCoChangeTop(data)); + } + } + }); + program .command('watch [dir]') .description('Watch project for file changes and incrementally update the graph') diff --git a/src/cochange.js b/src/cochange.js new file mode 100644 index 00000000..4545c23a --- /dev/null +++ b/src/cochange.js @@ -0,0 +1,469 @@ +/** + * Git co-change analysis — surfaces files that historically change together. + * + * Uses git log to find temporal coupling between files, computes Jaccard + * similarity coefficients, and stores results in the codegraph database. + */ + +import { execFileSync } from 'node:child_process'; +import fs from 'node:fs'; +import path from 'node:path'; +import { normalizePath } from './constants.js'; +import { findDbPath, initSchema, openDb, openReadonlyOrFail } from './db.js'; +import { warn } from './logger.js'; +import { isTestFile } from './queries.js'; + +/** + * Scan git history and return parsed commit data. + * @param {string} repoRoot - Absolute path to the git repo root + * @param {object} [opts] + * @param {string} [opts.since] - Git date expression (e.g. "1 year ago") + * @param {string} [opts.afterSha] - Only include commits after this SHA + * @returns {{ commits: Array<{sha: string, epoch: number, files: string[]}> }} + */ +export function scanGitHistory(repoRoot, opts = {}) { + const args = [ + 'log', + '--name-only', + '--pretty=format:%H%n%at', + '--no-merges', + '--diff-filter=AMRC', + ]; + if (opts.since) args.push(`--since=${opts.since}`); + if (opts.afterSha) args.push(`${opts.afterSha}..HEAD`); + args.push('--', '.'); + + let output; + try { + output = execFileSync('git', args, { + cwd: repoRoot, + encoding: 'utf-8', + maxBuffer: 50 * 1024 * 1024, + stdio: ['pipe', 'pipe', 'pipe'], + }); + } catch (e) { + warn(`Failed to scan git history: ${e.message}`); + return { commits: [] }; + } + + if (!output.trim()) return { commits: [] }; + + const commits = []; + // Split on double newlines to get blocks; each block is sha\nepoch\nfile1\nfile2... + const blocks = output.trim().split(/\n\n+/); + for (const block of blocks) { + const lines = block.split('\n').filter((l) => l.length > 0); + if (lines.length < 2) continue; + const sha = lines[0]; + const epoch = parseInt(lines[1], 10); + if (Number.isNaN(epoch)) continue; + const files = lines.slice(2).map((f) => normalizePath(f)); + if (files.length > 0) { + commits.push({ sha, epoch, files }); + } + } + + return { commits }; +} + +/** + * Compute co-change pairs from parsed commit data. + * @param {Array<{sha: string, epoch: number, files: string[]}>} commits + * @param {object} [opts] + * @param {number} [opts.minSupport=3] - Minimum number of co-occurrences + * @param {number} [opts.maxFilesPerCommit=50] - Skip commits with too many files + * @param {Set} [opts.knownFiles] - If provided, only include pairs where both files are in this set + * @returns {Map} + */ +export function computeCoChanges(commits, opts = {}) { + const minSupport = opts.minSupport ?? 3; + const maxFilesPerCommit = opts.maxFilesPerCommit ?? 50; + const knownFiles = opts.knownFiles || null; + + const fileCommitCounts = new Map(); + const pairCounts = new Map(); + const pairLastEpoch = new Map(); + + for (const commit of commits) { + let { files } = commit; + if (files.length > maxFilesPerCommit) continue; + + if (knownFiles) { + files = files.filter((f) => knownFiles.has(f)); + } + + // Count per-file commits + for (const f of files) { + fileCommitCounts.set(f, (fileCommitCounts.get(f) || 0) + 1); + } + + // Generate all unique pairs (canonical: a < b) + const sorted = [...new Set(files)].sort(); + for (let i = 0; i < sorted.length; i++) { + for (let j = i + 1; j < sorted.length; j++) { + const key = `${sorted[i]}\0${sorted[j]}`; + pairCounts.set(key, (pairCounts.get(key) || 0) + 1); + const prev = pairLastEpoch.get(key) || 0; + if (commit.epoch > prev) pairLastEpoch.set(key, commit.epoch); + } + } + } + + // Filter by minSupport and compute Jaccard + const results = new Map(); + for (const [key, count] of pairCounts) { + if (count < minSupport) continue; + const [fileA, fileB] = key.split('\0'); + const countA = fileCommitCounts.get(fileA) || 0; + const countB = fileCommitCounts.get(fileB) || 0; + const jaccard = count / (countA + countB - count); + results.set(key, { + commitCount: count, + jaccard, + lastEpoch: pairLastEpoch.get(key) || 0, + }); + } + + return results; +} + +/** + * Analyze git history and populate co-change data in the database. + * @param {string} [customDbPath] - Path to graph.db + * @param {object} [opts] + * @param {string} [opts.since] - Git date expression + * @param {number} [opts.minSupport] - Minimum co-occurrence count + * @param {number} [opts.maxFilesPerCommit] - Max files per commit + * @param {boolean} [opts.full] - Force full re-scan + * @returns {{ pairsFound: number, commitsScanned: number, since: string, minSupport: number }} + */ +export function analyzeCoChanges(customDbPath, opts = {}) { + const dbPath = findDbPath(customDbPath); + const db = openDb(dbPath); + initSchema(db); + + const repoRoot = path.resolve(path.dirname(dbPath), '..'); + + if (!fs.existsSync(path.join(repoRoot, '.git'))) { + db.close(); + return { error: `Not a git repository: ${repoRoot}` }; + } + + const since = opts.since || '1 year ago'; + const minSupport = opts.minSupport ?? 3; + const maxFilesPerCommit = opts.maxFilesPerCommit ?? 50; + + // Check for incremental state + let afterSha = null; + if (!opts.full) { + try { + const row = db + .prepare("SELECT value FROM co_change_meta WHERE key = 'last_analyzed_commit'") + .get(); + if (row) afterSha = row.value; + } catch { + /* table may not exist yet */ + } + } + + // If full re-scan, clear existing data + if (opts.full) { + db.exec('DELETE FROM co_changes'); + db.exec('DELETE FROM co_change_meta'); + } + + // Collect known files from the graph for filtering + let knownFiles = null; + try { + const rows = db.prepare('SELECT DISTINCT file FROM nodes').all(); + knownFiles = new Set(rows.map((r) => r.file)); + } catch { + /* nodes table may not exist */ + } + + const { commits } = scanGitHistory(repoRoot, { since, afterSha }); + const coChanges = computeCoChanges(commits, { minSupport, maxFilesPerCommit, knownFiles }); + + // Write results + const upsert = db.prepare(` + INSERT INTO co_changes (file_a, file_b, commit_count, jaccard, last_commit_epoch) + VALUES (?, ?, ?, ?, ?) + ON CONFLICT(file_a, file_b) DO UPDATE SET + commit_count = commit_count + excluded.commit_count, + jaccard = excluded.jaccard, + last_commit_epoch = MAX(co_changes.last_commit_epoch, excluded.last_commit_epoch) + `); + + const insertMany = db.transaction((pairs) => { + for (const [key, data] of pairs) { + const [fileA, fileB] = key.split('\0'); + upsert.run(fileA, fileB, data.commitCount, data.jaccard, data.lastEpoch); + } + }); + insertMany(coChanges); + + // Update metadata + const metaUpsert = db.prepare(` + INSERT INTO co_change_meta (key, value) VALUES (?, ?) + ON CONFLICT(key) DO UPDATE SET value = excluded.value + `); + if (commits.length > 0) { + metaUpsert.run('last_analyzed_commit', commits[0].sha); + } + metaUpsert.run('analyzed_at', new Date().toISOString()); + metaUpsert.run('since', since); + metaUpsert.run('min_support', String(minSupport)); + + const totalPairs = db.prepare('SELECT COUNT(*) as cnt FROM co_changes').get().cnt; + + db.close(); + + return { + pairsFound: totalPairs, + commitsScanned: commits.length, + since, + minSupport, + }; +} + +/** + * Query co-change partners for a specific file. + * @param {string} file - File path (partial match supported) + * @param {string} [customDbPath] + * @param {object} [opts] + * @param {number} [opts.limit=20] + * @param {number} [opts.minJaccard=0.3] + * @param {boolean} [opts.noTests] + * @returns {{ file: string, partners: Array, meta: object }} + */ +export function coChangeData(file, customDbPath, opts = {}) { + const db = openReadonlyOrFail(customDbPath); + const limit = opts.limit || 20; + const minJaccard = opts.minJaccard ?? 0.3; + const noTests = opts.noTests || false; + + // Check if co_changes table exists + try { + db.prepare('SELECT 1 FROM co_changes LIMIT 1').get(); + } catch { + db.close(); + return { error: 'No co-change data found. Run `codegraph co-change --analyze` first.' }; + } + + // Resolve file via partial match + const resolvedFile = resolveCoChangeFile(db, file); + if (!resolvedFile) { + db.close(); + return { error: `No co-change data found for file matching "${file}"` }; + } + + const rows = db + .prepare( + `SELECT file_a, file_b, commit_count, jaccard, last_commit_epoch + FROM co_changes + WHERE (file_a = ? OR file_b = ?) AND jaccard >= ? + ORDER BY jaccard DESC`, + ) + .all(resolvedFile, resolvedFile, minJaccard); + + const partners = []; + for (const row of rows) { + const partner = row.file_a === resolvedFile ? row.file_b : row.file_a; + if (noTests && isTestFile(partner)) continue; + partners.push({ + file: partner, + commitCount: row.commit_count, + jaccard: row.jaccard, + lastCommitDate: row.last_commit_epoch + ? new Date(row.last_commit_epoch * 1000).toISOString().slice(0, 10) + : null, + }); + if (partners.length >= limit) break; + } + + const meta = getCoChangeMeta(db); + db.close(); + + return { file: resolvedFile, partners, meta }; +} + +/** + * Query top global co-change pairs. + * @param {string} [customDbPath] + * @param {object} [opts] + * @param {number} [opts.limit=20] + * @param {number} [opts.minJaccard=0.3] + * @param {boolean} [opts.noTests] + * @returns {{ pairs: Array, meta: object }} + */ +export function coChangeTopData(customDbPath, opts = {}) { + const db = openReadonlyOrFail(customDbPath); + const limit = opts.limit || 20; + const minJaccard = opts.minJaccard ?? 0.3; + const noTests = opts.noTests || false; + + try { + db.prepare('SELECT 1 FROM co_changes LIMIT 1').get(); + } catch { + db.close(); + return { error: 'No co-change data found. Run `codegraph co-change --analyze` first.' }; + } + + const rows = db + .prepare( + `SELECT file_a, file_b, commit_count, jaccard, last_commit_epoch + FROM co_changes + WHERE jaccard >= ? + ORDER BY jaccard DESC`, + ) + .all(minJaccard); + + const pairs = []; + for (const row of rows) { + if (noTests && (isTestFile(row.file_a) || isTestFile(row.file_b))) continue; + pairs.push({ + fileA: row.file_a, + fileB: row.file_b, + commitCount: row.commit_count, + jaccard: row.jaccard, + lastCommitDate: row.last_commit_epoch + ? new Date(row.last_commit_epoch * 1000).toISOString().slice(0, 10) + : null, + }); + if (pairs.length >= limit) break; + } + + const meta = getCoChangeMeta(db); + db.close(); + + return { pairs, meta }; +} + +/** + * Batch-query co-change partners for a set of files. + * Takes an already-open readonly DB handle (for diff-impact integration). + * @param {string[]} files - File paths to query + * @param {import('better-sqlite3').Database} db - Already-open DB handle + * @param {object} [opts] + * @param {number} [opts.minJaccard=0.3] + * @param {number} [opts.limit=20] + * @param {boolean} [opts.noTests] + * @returns {Array<{file: string, coupledWith: string, commitCount: number, jaccard: number}>} + */ +export function coChangeForFiles(files, db, opts = {}) { + const minJaccard = opts.minJaccard ?? 0.3; + const limit = opts.limit ?? 20; + const noTests = opts.noTests || false; + const inputSet = new Set(files); + + if (files.length === 0) return []; + + const placeholders = files.map(() => '?').join(','); + const rows = db + .prepare( + `SELECT file_a, file_b, commit_count, jaccard + FROM co_changes + WHERE (file_a IN (${placeholders}) OR file_b IN (${placeholders})) + AND jaccard >= ? + ORDER BY jaccard DESC + LIMIT ?`, + ) + .all(...files, ...files, minJaccard, limit); + + const results = []; + for (const row of rows) { + const partner = inputSet.has(row.file_a) ? row.file_b : row.file_a; + const source = inputSet.has(row.file_a) ? row.file_a : row.file_b; + if (inputSet.has(partner)) continue; + if (noTests && isTestFile(partner)) continue; + results.push({ + file: partner, + coupledWith: source, + commitCount: row.commit_count, + jaccard: row.jaccard, + }); + } + + return results; +} + +/** + * Format co-change data for CLI output (single file). + */ +export function formatCoChange(data) { + if (data.error) return data.error; + if (data.partners.length === 0) return `No co-change partners found for ${data.file}`; + + const lines = [`\nCo-change partners for ${data.file}:\n`]; + for (const p of data.partners) { + const pct = `${(p.jaccard * 100).toFixed(0)}%`.padStart(4); + const commits = `${p.commitCount} commits`.padStart(12); + lines.push(` ${pct} ${commits} ${p.file}`); + } + if (data.meta?.analyzedAt) { + lines.push(`\n Analyzed: ${data.meta.analyzedAt} | Window: ${data.meta.since || 'all'}`); + } + return lines.join('\n'); +} + +/** + * Format top co-change pairs for CLI output (global view). + */ +export function formatCoChangeTop(data) { + if (data.error) return data.error; + if (data.pairs.length === 0) return 'No co-change pairs found.'; + + const lines = ['\nTop co-change pairs:\n']; + for (const p of data.pairs) { + const pct = `${(p.jaccard * 100).toFixed(0)}%`.padStart(4); + const commits = `${p.commitCount} commits`.padStart(12); + lines.push(` ${pct} ${commits} ${p.fileA} <-> ${p.fileB}`); + } + if (data.meta?.analyzedAt) { + lines.push(`\n Analyzed: ${data.meta.analyzedAt} | Window: ${data.meta.since || 'all'}`); + } + return lines.join('\n'); +} + +// ─── Internal Helpers ──────────────────────────────────────────────────── + +function resolveCoChangeFile(db, file) { + // Exact match first + const exact = db + .prepare( + 'SELECT file_a FROM co_changes WHERE file_a = ? UNION SELECT file_b FROM co_changes WHERE file_b = ? LIMIT 1', + ) + .get(file, file); + if (exact) return exact.file_a; + + // Partial match (ends with) + const partial = db + .prepare( + `SELECT file_a AS file FROM co_changes WHERE file_a LIKE ? + UNION + SELECT file_b AS file FROM co_changes WHERE file_b LIKE ? + LIMIT 1`, + ) + .get(`%${file}`, `%${file}`); + if (partial) return partial.file; + + return null; +} + +function getCoChangeMeta(db) { + try { + const rows = db.prepare('SELECT key, value FROM co_change_meta').all(); + const meta = {}; + for (const row of rows) { + meta[row.key] = row.value; + } + return { + analyzedAt: meta.analyzed_at || null, + since: meta.since || null, + minSupport: meta.min_support ? parseInt(meta.min_support, 10) : null, + lastCommit: meta.last_analyzed_commit || null, + }; + } catch { + return null; + } +} diff --git a/src/config.js b/src/config.js index 5e90e5a1..9759c2d0 100644 --- a/src/config.js +++ b/src/config.js @@ -24,6 +24,12 @@ export const DEFAULTS = { llm: { provider: null, model: null, baseUrl: null, apiKey: null, apiKeyCommand: null }, search: { defaultMinScore: 0.2, rrfK: 60, topK: 15 }, ci: { failOnCycles: false, impactThreshold: null }, + coChange: { + since: '1 year ago', + minSupport: 3, + minJaccard: 0.3, + maxFilesPerCommit: 50, + }, }; /** diff --git a/src/db.js b/src/db.js index 7d4b79fe..b2dbb67a 100644 --- a/src/db.js +++ b/src/db.js @@ -71,6 +71,27 @@ export const MIGRATIONS = [ version: 4, up: `ALTER TABLE file_hashes ADD COLUMN size INTEGER DEFAULT 0;`, }, + { + version: 5, + up: ` + CREATE TABLE IF NOT EXISTS co_changes ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + file_a TEXT NOT NULL, + file_b TEXT NOT NULL, + commit_count INTEGER NOT NULL, + jaccard REAL NOT NULL, + last_commit_epoch INTEGER, + UNIQUE(file_a, file_b) + ); + CREATE INDEX IF NOT EXISTS idx_co_changes_file_a ON co_changes(file_a); + CREATE INDEX IF NOT EXISTS idx_co_changes_file_b ON co_changes(file_b); + CREATE INDEX IF NOT EXISTS idx_co_changes_jaccard ON co_changes(jaccard DESC); + CREATE TABLE IF NOT EXISTS co_change_meta ( + key TEXT PRIMARY KEY, + value TEXT NOT NULL + ); + `, + }, ]; export function openDb(dbPath) { diff --git a/src/index.js b/src/index.js index 9da90b5e..d89710d7 100644 --- a/src/index.js +++ b/src/index.js @@ -7,6 +7,15 @@ // Graph building export { buildGraph, collectFiles, loadPathAliases, resolveImportPath } from './builder.js'; +// Co-change analysis +export { + analyzeCoChanges, + coChangeData, + coChangeForFiles, + coChangeTopData, + computeCoChanges, + scanGitHistory, +} from './cochange.js'; // Configuration export { loadConfig } from './config.js'; // Shared constants diff --git a/src/mcp.js b/src/mcp.js index 2daeeb84..f5411874 100644 --- a/src/mcp.js +++ b/src/mcp.js @@ -312,6 +312,27 @@ const BASE_TOOLS = [ }, }, }, + { + name: 'co_changes', + description: + 'Find files that historically change together based on git commit history. Requires prior `codegraph co-change --analyze`.', + inputSchema: { + type: 'object', + properties: { + file: { + type: 'string', + description: 'File path (partial match). Omit for top global pairs.', + }, + limit: { type: 'number', description: 'Max results', default: 20 }, + min_jaccard: { + type: 'number', + description: 'Minimum Jaccard similarity (0-1)', + default: 0.3, + }, + no_tests: { type: 'boolean', description: 'Exclude test files', default: false }, + }, + }, + }, ]; const LIST_REPOS_TOOL = { @@ -584,6 +605,21 @@ export async function startMCPServer(customDbPath, options = {}) { }); break; } + case 'co_changes': { + const { coChangeData, coChangeTopData } = await import('./cochange.js'); + result = args.file + ? coChangeData(args.file, dbPath, { + limit: args.limit, + minJaccard: args.min_jaccard, + noTests: args.no_tests, + }) + : coChangeTopData(dbPath, { + limit: args.limit, + minJaccard: args.min_jaccard, + noTests: args.no_tests, + }); + break; + } case 'list_repos': { const { listRepos, pruneRegistry } = await import('./registry.js'); pruneRegistry(); diff --git a/src/queries.js b/src/queries.js index 3aaeaf67..13353a39 100644 --- a/src/queries.js +++ b/src/queries.js @@ -730,16 +730,51 @@ export function diffImpactData(customDbPath, opts = {}) { const affectedFiles = new Set(); for (const key of allAffected) affectedFiles.add(key.split(':')[0]); + // Look up historically coupled files from co-change data + const historicallyCoupled = []; + try { + db.prepare('SELECT 1 FROM co_changes LIMIT 1').get(); + const changedFilesList = [...changedRanges.keys()]; + const staticFiles = new Set([...changedRanges.keys(), ...affectedFiles]); + const placeholders = changedFilesList.map(() => '?').join(','); + const coRows = db + .prepare( + `SELECT file_a, file_b, commit_count, jaccard + FROM co_changes + WHERE (file_a IN (${placeholders}) OR file_b IN (${placeholders})) + AND jaccard >= 0.3 + ORDER BY jaccard DESC + LIMIT 20`, + ) + .all(...changedFilesList, ...changedFilesList); + for (const row of coRows) { + const partner = changedFilesList.includes(row.file_a) ? row.file_b : row.file_a; + const source = changedFilesList.includes(row.file_a) ? row.file_a : row.file_b; + if (!staticFiles.has(partner) && (!noTests || !isTestFile(partner))) { + historicallyCoupled.push({ + file: partner, + coupledWith: source, + jaccard: row.jaccard, + commitCount: row.commit_count, + }); + } + } + } catch { + /* co_changes table doesn't exist — skip silently */ + } + db.close(); return { changedFiles: changedRanges.size, newFiles: [...newFiles], affectedFunctions: functionResults, affectedFiles: [...affectedFiles], + historicallyCoupled, summary: { functionsChanged: affectedFunctions.length, callersAffected: allAffected.size, filesAffected: affectedFiles.size, + historicallyCoupledCount: historicallyCoupled.length, }, }; } @@ -2428,9 +2463,20 @@ export function diffImpact(customDbPath, opts = {}) { console.log(` ${kindIcon(fn.kind)} ${fn.name} -- ${fn.file}:${fn.line}`); if (fn.transitiveCallers > 0) console.log(` ^ ${fn.transitiveCallers} transitive callers`); } + if (data.historicallyCoupled && data.historicallyCoupled.length > 0) { + console.log('\n Historically coupled (not in static graph):\n'); + for (const c of data.historicallyCoupled) { + const pct = `${(c.jaccard * 100).toFixed(0)}%`; + console.log( + ` ${c.file} <- coupled with ${c.coupledWith} (${pct}, ${c.commitCount} commits)`, + ); + } + } if (data.summary) { - console.log( - `\n Summary: ${data.summary.functionsChanged} functions changed -> ${data.summary.callersAffected} callers affected across ${data.summary.filesAffected} files\n`, - ); + let summaryLine = `\n Summary: ${data.summary.functionsChanged} functions changed -> ${data.summary.callersAffected} callers affected across ${data.summary.filesAffected} files`; + if (data.summary.historicallyCoupledCount > 0) { + summaryLine += `, ${data.summary.historicallyCoupledCount} historically coupled`; + } + console.log(`${summaryLine}\n`); } } diff --git a/tests/integration/cochange.test.js b/tests/integration/cochange.test.js new file mode 100644 index 00000000..13593b7e --- /dev/null +++ b/tests/integration/cochange.test.js @@ -0,0 +1,283 @@ +/** + * Integration tests for git co-change analysis. + * + * A. computeCoChanges — pure logic, no git/DB + * B. analyzeCoChanges + query — DB integration + * C. scanGitHistory — real git repo + */ + +import { execFileSync } from 'node:child_process'; +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; +import Database from 'better-sqlite3'; +import { afterAll, beforeAll, describe, expect, test } from 'vitest'; +import { + coChangeData, + coChangeTopData, + computeCoChanges, + scanGitHistory, +} from '../../src/cochange.js'; +import { initSchema } from '../../src/db.js'; + +// ─── A. computeCoChanges (pure logic) ──────────────────────────────── + +describe('computeCoChanges', () => { + test('computes correct Jaccard for known commit sets', () => { + const commits = [ + { sha: 'a1', epoch: 1000, files: ['a.js', 'b.js'] }, + { sha: 'a2', epoch: 2000, files: ['a.js', 'b.js'] }, + { sha: 'a3', epoch: 3000, files: ['a.js', 'b.js'] }, + { sha: 'a4', epoch: 4000, files: ['a.js', 'c.js'] }, + ]; + // a.js appears in 4 commits, b.js in 3, pair(a,b) = 3 + // jaccard(a,b) = 3 / (4 + 3 - 3) = 3/4 = 0.75 + const result = computeCoChanges(commits, { minSupport: 1 }); + const abKey = 'a.js\0b.js'; + expect(result.has(abKey)).toBe(true); + expect(result.get(abKey).jaccard).toBeCloseTo(0.75); + expect(result.get(abKey).commitCount).toBe(3); + }); + + test('filters by minSupport', () => { + const commits = [ + { sha: 'a1', epoch: 1000, files: ['a.js', 'b.js'] }, + { sha: 'a2', epoch: 2000, files: ['a.js', 'b.js'] }, + { sha: 'a3', epoch: 3000, files: ['a.js', 'c.js'] }, + ]; + const result = computeCoChanges(commits, { minSupport: 3 }); + // pair(a,b) only has 2 co-occurrences, pair(a,c) only 1 + expect(result.size).toBe(0); + }); + + test('skips commits exceeding maxFilesPerCommit', () => { + const commits = [ + { sha: 'a1', epoch: 1000, files: ['a.js', 'b.js', 'c.js', 'd.js'] }, + { sha: 'a2', epoch: 2000, files: ['a.js', 'b.js'] }, + { sha: 'a3', epoch: 3000, files: ['a.js', 'b.js'] }, + { sha: 'a4', epoch: 4000, files: ['a.js', 'b.js'] }, + ]; + const result = computeCoChanges(commits, { minSupport: 3, maxFilesPerCommit: 3 }); + // First commit skipped (4 files > max 3) + // pair(a,b) = 3 from commits a2,a3,a4; a appears in 3 commits, b in 3 + // jaccard = 3/(3+3-3) = 1.0 + const abKey = 'a.js\0b.js'; + expect(result.has(abKey)).toBe(true); + expect(result.get(abKey).jaccard).toBeCloseTo(1.0); + }); + + test('enforces canonical pair ordering (file_a < file_b)', () => { + const commits = [ + { sha: 'a1', epoch: 1000, files: ['z.js', 'a.js'] }, + { sha: 'a2', epoch: 2000, files: ['z.js', 'a.js'] }, + { sha: 'a3', epoch: 3000, files: ['z.js', 'a.js'] }, + ]; + const result = computeCoChanges(commits, { minSupport: 1 }); + // Should be stored as a.js < z.js + expect(result.has('a.js\0z.js')).toBe(true); + expect(result.has('z.js\0a.js')).toBe(false); + }); + + test('empty input returns empty map', () => { + const result = computeCoChanges([], { minSupport: 1 }); + expect(result.size).toBe(0); + }); + + test('tracks lastEpoch correctly', () => { + const commits = [ + { sha: 'a1', epoch: 1000, files: ['a.js', 'b.js'] }, + { sha: 'a2', epoch: 5000, files: ['a.js', 'b.js'] }, + { sha: 'a3', epoch: 3000, files: ['a.js', 'b.js'] }, + ]; + const result = computeCoChanges(commits, { minSupport: 1 }); + expect(result.get('a.js\0b.js').lastEpoch).toBe(5000); + }); + + test('filters by knownFiles when provided', () => { + const commits = [ + { sha: 'a1', epoch: 1000, files: ['a.js', 'b.js', 'c.js'] }, + { sha: 'a2', epoch: 2000, files: ['a.js', 'b.js', 'c.js'] }, + { sha: 'a3', epoch: 3000, files: ['a.js', 'b.js', 'c.js'] }, + ]; + const knownFiles = new Set(['a.js', 'b.js']); + const result = computeCoChanges(commits, { minSupport: 1, knownFiles }); + expect(result.has('a.js\0b.js')).toBe(true); + // c.js pairs should not exist + expect(result.has('a.js\0c.js')).toBe(false); + expect(result.has('b.js\0c.js')).toBe(false); + }); +}); + +// ─── B. DB integration (coChangeData / coChangeTopData) ────────────── + +describe('coChangeData + coChangeTopData', () => { + let tmpDir, dbPath; + + beforeAll(() => { + tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-cochange-')); + const cgDir = path.join(tmpDir, '.codegraph'); + fs.mkdirSync(cgDir, { recursive: true }); + dbPath = path.join(cgDir, 'graph.db'); + + const db = new Database(dbPath); + db.pragma('journal_mode = WAL'); + initSchema(db); + + // Insert known co_changes rows + const insert = db.prepare( + 'INSERT INTO co_changes (file_a, file_b, commit_count, jaccard, last_commit_epoch) VALUES (?, ?, ?, ?, ?)', + ); + insert.run('src/a.js', 'src/b.js', 10, 0.8, 1700000000); + insert.run('src/a.js', 'src/c.js', 5, 0.5, 1690000000); + insert.run('src/b.js', 'src/c.js', 3, 0.35, 1680000000); + insert.run('src/a.js', 'tests/a.test.js', 8, 0.7, 1700000000); + insert.run('src/d.js', 'src/e.js', 2, 0.2, 1670000000); + + // Insert meta + const metaInsert = db.prepare('INSERT INTO co_change_meta (key, value) VALUES (?, ?)'); + metaInsert.run('analyzed_at', '2024-01-01T00:00:00.000Z'); + metaInsert.run('since', '1 year ago'); + + db.close(); + }); + + afterAll(() => { + fs.rmSync(tmpDir, { recursive: true, force: true }); + }); + + test('coChangeData returns correct partners sorted by jaccard', () => { + const data = coChangeData('src/a.js', dbPath); + expect(data.error).toBeUndefined(); + expect(data.file).toBe('src/a.js'); + expect(data.partners.length).toBeGreaterThanOrEqual(2); + // Sorted by jaccard desc + for (let i = 1; i < data.partners.length; i++) { + expect(data.partners[i - 1].jaccard).toBeGreaterThanOrEqual(data.partners[i].jaccard); + } + }); + + test('coChangeData partial match works', () => { + const data = coChangeData('a.js', dbPath); + expect(data.error).toBeUndefined(); + expect(data.file).toBe('src/a.js'); + }); + + test('coChangeTopData returns global top pairs', () => { + const data = coChangeTopData(dbPath, { minJaccard: 0.3 }); + expect(data.error).toBeUndefined(); + expect(data.pairs.length).toBeGreaterThanOrEqual(3); + // First pair should have highest jaccard + expect(data.pairs[0].jaccard).toBe(0.8); + }); + + test('noTests filtering works', () => { + const data = coChangeData('src/a.js', dbPath, { noTests: true }); + const testPartners = data.partners.filter((p) => p.file.includes('.test.')); + expect(testPartners.length).toBe(0); + }); + + test('limit is respected', () => { + const data = coChangeData('src/a.js', dbPath, { limit: 1 }); + expect(data.partners.length).toBeLessThanOrEqual(1); + }); + + test('minJaccard filtering works', () => { + const data = coChangeTopData(dbPath, { minJaccard: 0.6 }); + for (const p of data.pairs) { + expect(p.jaccard).toBeGreaterThanOrEqual(0.6); + } + }); + + test('returns error when table is empty and file not found', () => { + // Query for a nonexistent file + const data = coChangeData('nonexistent.js', dbPath); + expect(data.error).toBeDefined(); + }); + + test('meta is included in response', () => { + const data = coChangeTopData(dbPath); + expect(data.meta).toBeDefined(); + expect(data.meta.analyzedAt).toBe('2024-01-01T00:00:00.000Z'); + expect(data.meta.since).toBe('1 year ago'); + }); +}); + +// ─── C. scanGitHistory (real git repo) ─────────────────────────────── + +describe('scanGitHistory', () => { + let tmpDir; + + beforeAll(() => { + tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-git-')); + execFileSync('git', ['init'], { cwd: tmpDir, stdio: 'pipe' }); + execFileSync('git', ['config', 'user.email', 'test@test.com'], { cwd: tmpDir, stdio: 'pipe' }); + execFileSync('git', ['config', 'user.name', 'Test'], { cwd: tmpDir, stdio: 'pipe' }); + + // Commit 1: a.js + b.js + fs.writeFileSync(path.join(tmpDir, 'a.js'), 'export const a = 1;'); + fs.writeFileSync(path.join(tmpDir, 'b.js'), 'export const b = 2;'); + execFileSync('git', ['add', 'a.js', 'b.js'], { cwd: tmpDir, stdio: 'pipe' }); + execFileSync('git', ['commit', '-m', 'commit1', '--no-gpg-sign'], { + cwd: tmpDir, + stdio: 'pipe', + }); + + // Commit 2: a.js + c.js + fs.writeFileSync(path.join(tmpDir, 'a.js'), 'export const a = 2;'); + fs.writeFileSync(path.join(tmpDir, 'c.js'), 'export const c = 1;'); + execFileSync('git', ['add', 'a.js', 'c.js'], { cwd: tmpDir, stdio: 'pipe' }); + execFileSync('git', ['commit', '-m', 'commit2', '--no-gpg-sign'], { + cwd: tmpDir, + stdio: 'pipe', + }); + + // Commit 3: a.js + b.js + fs.writeFileSync(path.join(tmpDir, 'a.js'), 'export const a = 3;'); + fs.writeFileSync(path.join(tmpDir, 'b.js'), 'export const b = 3;'); + execFileSync('git', ['add', 'a.js', 'b.js'], { cwd: tmpDir, stdio: 'pipe' }); + execFileSync('git', ['commit', '-m', 'commit3', '--no-gpg-sign'], { + cwd: tmpDir, + stdio: 'pipe', + }); + }); + + afterAll(() => { + fs.rmSync(tmpDir, { recursive: true, force: true }); + }); + + test('returns correct number of commits and files', () => { + const { commits } = scanGitHistory(tmpDir); + expect(commits.length).toBe(3); + // Most recent commit first (git log order) + expect(commits[0].files).toContain('a.js'); + expect(commits[0].files).toContain('b.js'); + }); + + test('each commit has sha, epoch, and files', () => { + const { commits } = scanGitHistory(tmpDir); + for (const c of commits) { + expect(c.sha).toMatch(/^[a-f0-9]{40}$/); + expect(typeof c.epoch).toBe('number'); + expect(c.epoch).toBeGreaterThan(0); + expect(Array.isArray(c.files)).toBe(true); + expect(c.files.length).toBeGreaterThan(0); + } + }); + + test('incremental (afterSha) works', () => { + const { commits: all } = scanGitHistory(tmpDir); + // Get the oldest commit sha + const oldestSha = all[all.length - 1].sha; + const { commits: incremental } = scanGitHistory(tmpDir, { afterSha: oldestSha }); + // Should exclude the oldest commit + expect(incremental.length).toBe(all.length - 1); + for (const c of incremental) { + expect(c.sha).not.toBe(oldestSha); + } + }); + + test('returns empty for nonexistent repo', () => { + const { commits } = scanGitHistory('/nonexistent/path'); + expect(commits).toEqual([]); + }); +}); diff --git a/tests/unit/mcp.test.js b/tests/unit/mcp.test.js index 0f3dd77b..c6709014 100644 --- a/tests/unit/mcp.test.js +++ b/tests/unit/mcp.test.js @@ -25,6 +25,7 @@ const ALL_TOOL_NAMES = [ 'list_functions', 'structure', 'hotspots', + 'co_changes', 'node_roles', 'list_repos', ]; From aef1787258f3389fe43ab3fbe5d5914ac497f268 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 25 Feb 2026 08:58:26 -0700 Subject: [PATCH 2/3] refactor: reuse coChangeForFiles in diffImpactData Replace inline co-change SQL query with a call to the existing coChangeForFiles helper, fixing both duplicated logic and a potential empty-array SQL error (WHERE file_a IN ()). Addresses Greptile review feedback on PR #95. --- src/queries.js | 34 +++++++++------------------------- 1 file changed, 9 insertions(+), 25 deletions(-) diff --git a/src/queries.js b/src/queries.js index 13353a39..1e8288d9 100644 --- a/src/queries.js +++ b/src/queries.js @@ -1,6 +1,7 @@ import { execFileSync } from 'node:child_process'; import fs from 'node:fs'; import path from 'node:path'; +import { coChangeForFiles } from './cochange.js'; import { findCycles } from './cycles.js'; import { findDbPath, openReadonlyOrFail } from './db.js'; import { debug } from './logger.js'; @@ -731,34 +732,17 @@ export function diffImpactData(customDbPath, opts = {}) { for (const key of allAffected) affectedFiles.add(key.split(':')[0]); // Look up historically coupled files from co-change data - const historicallyCoupled = []; + let historicallyCoupled = []; try { db.prepare('SELECT 1 FROM co_changes LIMIT 1').get(); const changedFilesList = [...changedRanges.keys()]; - const staticFiles = new Set([...changedRanges.keys(), ...affectedFiles]); - const placeholders = changedFilesList.map(() => '?').join(','); - const coRows = db - .prepare( - `SELECT file_a, file_b, commit_count, jaccard - FROM co_changes - WHERE (file_a IN (${placeholders}) OR file_b IN (${placeholders})) - AND jaccard >= 0.3 - ORDER BY jaccard DESC - LIMIT 20`, - ) - .all(...changedFilesList, ...changedFilesList); - for (const row of coRows) { - const partner = changedFilesList.includes(row.file_a) ? row.file_b : row.file_a; - const source = changedFilesList.includes(row.file_a) ? row.file_a : row.file_b; - if (!staticFiles.has(partner) && (!noTests || !isTestFile(partner))) { - historicallyCoupled.push({ - file: partner, - coupledWith: source, - jaccard: row.jaccard, - commitCount: row.commit_count, - }); - } - } + const coResults = coChangeForFiles(changedFilesList, db, { + minJaccard: 0.3, + limit: 20, + noTests, + }); + // Exclude files already found via static analysis + historicallyCoupled = coResults.filter((r) => !affectedFiles.has(r.file)); } catch { /* co_changes table doesn't exist — skip silently */ } From e2a771beb4dd7636e63e45c180b76f6f0f53e3c8 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 25 Feb 2026 09:19:09 -0700 Subject: [PATCH 3/3] fix: recompute Jaccard from total file counts during incremental co-change analysis The incremental upsert was overwriting Jaccard with a value computed only from new commits, ignoring historical data. Now stores per-file commit counts in a new file_commit_counts table (migration v6) and recomputes Jaccard from the accumulated totals after each merge. --- src/cochange.js | 49 ++++++++++++++++++++++++------ src/db.js | 9 ++++++ tests/integration/cochange.test.js | 14 ++++----- 3 files changed, 55 insertions(+), 17 deletions(-) diff --git a/src/cochange.js b/src/cochange.js index 4545c23a..25954769 100644 --- a/src/cochange.js +++ b/src/cochange.js @@ -124,7 +124,7 @@ export function computeCoChanges(commits, opts = {}) { }); } - return results; + return { pairs: results, fileCommitCounts }; } /** @@ -170,6 +170,7 @@ export function analyzeCoChanges(customDbPath, opts = {}) { if (opts.full) { db.exec('DELETE FROM co_changes'); db.exec('DELETE FROM co_change_meta'); + db.exec('DELETE FROM file_commit_counts'); } // Collect known files from the graph for filtering @@ -182,25 +183,53 @@ export function analyzeCoChanges(customDbPath, opts = {}) { } const { commits } = scanGitHistory(repoRoot, { since, afterSha }); - const coChanges = computeCoChanges(commits, { minSupport, maxFilesPerCommit, knownFiles }); + const { pairs: coChanges, fileCommitCounts } = computeCoChanges(commits, { + minSupport, + maxFilesPerCommit, + knownFiles, + }); - // Write results - const upsert = db.prepare(` + // Upsert per-file commit counts so Jaccard can be recomputed from totals + const fileCountUpsert = db.prepare(` + INSERT INTO file_commit_counts (file, commit_count) VALUES (?, ?) + ON CONFLICT(file) DO UPDATE SET commit_count = commit_count + excluded.commit_count + `); + + // Upsert pair counts (accumulate commit_count, jaccard placeholder — recomputed below) + const pairUpsert = db.prepare(` INSERT INTO co_changes (file_a, file_b, commit_count, jaccard, last_commit_epoch) - VALUES (?, ?, ?, ?, ?) + VALUES (?, ?, ?, 0, ?) ON CONFLICT(file_a, file_b) DO UPDATE SET commit_count = commit_count + excluded.commit_count, - jaccard = excluded.jaccard, last_commit_epoch = MAX(co_changes.last_commit_epoch, excluded.last_commit_epoch) `); - const insertMany = db.transaction((pairs) => { - for (const [key, data] of pairs) { + const insertMany = db.transaction(() => { + for (const [file, count] of fileCommitCounts) { + fileCountUpsert.run(file, count); + } + for (const [key, data] of coChanges) { const [fileA, fileB] = key.split('\0'); - upsert.run(fileA, fileB, data.commitCount, data.jaccard, data.lastEpoch); + pairUpsert.run(fileA, fileB, data.commitCount, data.lastEpoch); } }); - insertMany(coChanges); + insertMany(); + + // Recompute Jaccard for all affected pairs from total file commit counts + const affectedFiles = [...fileCommitCounts.keys()]; + if (affectedFiles.length > 0) { + const ph = affectedFiles.map(() => '?').join(','); + db.prepare(` + UPDATE co_changes SET jaccard = ( + SELECT CAST(co_changes.commit_count AS REAL) / ( + COALESCE(fa.commit_count, 0) + COALESCE(fb.commit_count, 0) - co_changes.commit_count + ) + FROM file_commit_counts fa, file_commit_counts fb + WHERE fa.file = co_changes.file_a AND fb.file = co_changes.file_b + ) + WHERE file_a IN (${ph}) OR file_b IN (${ph}) + `).run(...affectedFiles, ...affectedFiles); + } // Update metadata const metaUpsert = db.prepare(` diff --git a/src/db.js b/src/db.js index b2dbb67a..a763a0f9 100644 --- a/src/db.js +++ b/src/db.js @@ -92,6 +92,15 @@ export const MIGRATIONS = [ ); `, }, + { + version: 6, + up: ` + CREATE TABLE IF NOT EXISTS file_commit_counts ( + file TEXT PRIMARY KEY, + commit_count INTEGER NOT NULL DEFAULT 0 + ); + `, + }, ]; export function openDb(dbPath) { diff --git a/tests/integration/cochange.test.js b/tests/integration/cochange.test.js index 13593b7e..e1a9ad51 100644 --- a/tests/integration/cochange.test.js +++ b/tests/integration/cochange.test.js @@ -32,7 +32,7 @@ describe('computeCoChanges', () => { ]; // a.js appears in 4 commits, b.js in 3, pair(a,b) = 3 // jaccard(a,b) = 3 / (4 + 3 - 3) = 3/4 = 0.75 - const result = computeCoChanges(commits, { minSupport: 1 }); + const { pairs: result } = computeCoChanges(commits, { minSupport: 1 }); const abKey = 'a.js\0b.js'; expect(result.has(abKey)).toBe(true); expect(result.get(abKey).jaccard).toBeCloseTo(0.75); @@ -45,7 +45,7 @@ describe('computeCoChanges', () => { { sha: 'a2', epoch: 2000, files: ['a.js', 'b.js'] }, { sha: 'a3', epoch: 3000, files: ['a.js', 'c.js'] }, ]; - const result = computeCoChanges(commits, { minSupport: 3 }); + const { pairs: result } = computeCoChanges(commits, { minSupport: 3 }); // pair(a,b) only has 2 co-occurrences, pair(a,c) only 1 expect(result.size).toBe(0); }); @@ -57,7 +57,7 @@ describe('computeCoChanges', () => { { sha: 'a3', epoch: 3000, files: ['a.js', 'b.js'] }, { sha: 'a4', epoch: 4000, files: ['a.js', 'b.js'] }, ]; - const result = computeCoChanges(commits, { minSupport: 3, maxFilesPerCommit: 3 }); + const { pairs: result } = computeCoChanges(commits, { minSupport: 3, maxFilesPerCommit: 3 }); // First commit skipped (4 files > max 3) // pair(a,b) = 3 from commits a2,a3,a4; a appears in 3 commits, b in 3 // jaccard = 3/(3+3-3) = 1.0 @@ -72,14 +72,14 @@ describe('computeCoChanges', () => { { sha: 'a2', epoch: 2000, files: ['z.js', 'a.js'] }, { sha: 'a3', epoch: 3000, files: ['z.js', 'a.js'] }, ]; - const result = computeCoChanges(commits, { minSupport: 1 }); + const { pairs: result } = computeCoChanges(commits, { minSupport: 1 }); // Should be stored as a.js < z.js expect(result.has('a.js\0z.js')).toBe(true); expect(result.has('z.js\0a.js')).toBe(false); }); test('empty input returns empty map', () => { - const result = computeCoChanges([], { minSupport: 1 }); + const { pairs: result } = computeCoChanges([], { minSupport: 1 }); expect(result.size).toBe(0); }); @@ -89,7 +89,7 @@ describe('computeCoChanges', () => { { sha: 'a2', epoch: 5000, files: ['a.js', 'b.js'] }, { sha: 'a3', epoch: 3000, files: ['a.js', 'b.js'] }, ]; - const result = computeCoChanges(commits, { minSupport: 1 }); + const { pairs: result } = computeCoChanges(commits, { minSupport: 1 }); expect(result.get('a.js\0b.js').lastEpoch).toBe(5000); }); @@ -100,7 +100,7 @@ describe('computeCoChanges', () => { { sha: 'a3', epoch: 3000, files: ['a.js', 'b.js', 'c.js'] }, ]; const knownFiles = new Set(['a.js', 'b.js']); - const result = computeCoChanges(commits, { minSupport: 1, knownFiles }); + const { pairs: result } = computeCoChanges(commits, { minSupport: 1, knownFiles }); expect(result.has('a.js\0b.js')).toBe(true); // c.js pairs should not exist expect(result.has('a.js\0c.js')).toBe(false);