From c21c38786f47c2c9bcbc9a188c0bffd2160f7af0 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 25 Feb 2026 02:37:50 -0700 Subject: [PATCH 1/4] chore: configure bge-large as default embedding model Sets Xenova/bge-large-en-v1.5 (1024d) as the default embedding model for codegraph self-analysis, replacing the default minilm. --- .codegraphrc.json | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .codegraphrc.json diff --git a/.codegraphrc.json b/.codegraphrc.json new file mode 100644 index 00000000..98663830 --- /dev/null +++ b/.codegraphrc.json @@ -0,0 +1,3 @@ +{ + "embeddings": { "model": "bge-large" } +} From 77ffffcac566dbb84a95bece1b3d5ae432cf01e1 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 25 Feb 2026 09:28:26 -0700 Subject: [PATCH 2/4] fix: make embed command respect config embeddings.model The CLI embed command hardcoded 'minilm' as the default model via Commander, ignoring .codegraphrc.json config entirely. Now the embed command reads config.embeddings.model as the default when no -m flag is passed. Also fixes DEFAULTS.embeddings.model from 'nomic-v1.5' to 'minilm' to match the actual fallback used by the embedder, and updates the models command to show the configured default. --- src/cli.js | 15 ++++++++------- src/config.js | 2 +- tests/unit/config.test.js | 2 +- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/cli.js b/src/cli.js index 60dbac63..2487ef75 100644 --- a/src/cli.js +++ b/src/cli.js @@ -423,12 +423,13 @@ program .command('models') .description('List available embedding models') .action(() => { + const defaultModel = config.embeddings?.model || 'minilm'; console.log('\nAvailable embedding models:\n'); - for (const [key, config] of Object.entries(MODELS)) { - const def = key === 'minilm' ? ' (default)' : ''; - const ctx = config.contextWindow ? `${config.contextWindow} ctx` : ''; + for (const [key, cfg] of Object.entries(MODELS)) { + const def = key === defaultModel ? ' (default)' : ''; + const ctx = cfg.contextWindow ? `${cfg.contextWindow} ctx` : ''; console.log( - ` ${key.padEnd(12)} ${String(config.dim).padStart(4)}d ${ctx.padEnd(9)} ${config.desc}${def}`, + ` ${key.padEnd(12)} ${String(cfg.dim).padStart(4)}d ${ctx.padEnd(9)} ${cfg.desc}${def}`, ); } console.log('\nUsage: codegraph embed --model --strategy '); @@ -442,8 +443,7 @@ program ) .option( '-m, --model ', - 'Embedding model: minilm (default), jina-small, jina-base, jina-code, nomic, nomic-v1.5, bge-large. Run `codegraph models` for details', - 'minilm', + 'Embedding model (default from config or minilm). Run `codegraph models` for details', ) .option( '-s, --strategy ', @@ -458,7 +458,8 @@ program process.exit(1); } const root = path.resolve(dir || '.'); - await buildEmbeddings(root, opts.model, undefined, { strategy: opts.strategy }); + const model = opts.model || config.embeddings?.model || 'minilm'; + await buildEmbeddings(root, model, undefined, { strategy: opts.strategy }); }); program diff --git a/src/config.js b/src/config.js index 5e90e5a1..4bee5a58 100644 --- a/src/config.js +++ b/src/config.js @@ -20,7 +20,7 @@ export const DEFAULTS = { defaultLimit: 20, excludeTests: false, }, - embeddings: { model: 'nomic-v1.5', llmProvider: null }, + embeddings: { model: 'minilm', llmProvider: null }, llm: { provider: null, model: null, baseUrl: null, apiKey: null, apiKeyCommand: null }, search: { defaultMinScore: 0.2, rrfK: 60, topK: 15 }, ci: { failOnCycles: false, impactThreshold: null }, diff --git a/tests/unit/config.test.js b/tests/unit/config.test.js index e922abe5..c005e6cb 100644 --- a/tests/unit/config.test.js +++ b/tests/unit/config.test.js @@ -55,7 +55,7 @@ describe('DEFAULTS', () => { }); it('has embeddings defaults', () => { - expect(DEFAULTS.embeddings).toEqual({ model: 'nomic-v1.5', llmProvider: null }); + expect(DEFAULTS.embeddings).toEqual({ model: 'minilm', llmProvider: null }); }); it('has llm defaults', () => { From 832fa49578bd51ffe3166cbbdc8dee71a51ef8c2 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 25 Feb 2026 09:32:56 -0700 Subject: [PATCH 3/4] fix: use DEFAULT_MODEL as single source of truth for embed default Change DEFAULT_MODEL in embedder.js from 'minilm' to 'nomic-v1.5' to match the intended default. Import DEFAULT_MODEL in cli.js instead of hardcoding strings. The embed command now resolves the model as: CLI flag > config.embeddings.model > DEFAULT_MODEL. Restores config.js DEFAULTS.embeddings.model to 'nomic-v1.5' (was incorrectly changed in prior commit). --- src/cli.js | 12 +++++++++--- src/config.js | 2 +- src/embedder.js | 2 +- tests/unit/config.test.js | 2 +- 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/cli.js b/src/cli.js index 2487ef75..e048ac44 100644 --- a/src/cli.js +++ b/src/cli.js @@ -7,7 +7,13 @@ import { buildGraph } from './builder.js'; import { loadConfig } from './config.js'; import { findCycles, formatCycles } from './cycles.js'; import { openReadonlyOrFail } from './db.js'; -import { buildEmbeddings, EMBEDDING_STRATEGIES, MODELS, search } from './embedder.js'; +import { + buildEmbeddings, + DEFAULT_MODEL, + EMBEDDING_STRATEGIES, + MODELS, + search, +} from './embedder.js'; import { exportDOT, exportJSON, exportMermaid } from './export.js'; import { setVerbose } from './logger.js'; import { @@ -423,7 +429,7 @@ program .command('models') .description('List available embedding models') .action(() => { - const defaultModel = config.embeddings?.model || 'minilm'; + const defaultModel = config.embeddings?.model || DEFAULT_MODEL; console.log('\nAvailable embedding models:\n'); for (const [key, cfg] of Object.entries(MODELS)) { const def = key === defaultModel ? ' (default)' : ''; @@ -458,7 +464,7 @@ program process.exit(1); } const root = path.resolve(dir || '.'); - const model = opts.model || config.embeddings?.model || 'minilm'; + const model = opts.model || config.embeddings?.model || DEFAULT_MODEL; await buildEmbeddings(root, model, undefined, { strategy: opts.strategy }); }); diff --git a/src/config.js b/src/config.js index 4bee5a58..5e90e5a1 100644 --- a/src/config.js +++ b/src/config.js @@ -20,7 +20,7 @@ export const DEFAULTS = { defaultLimit: 20, excludeTests: false, }, - embeddings: { model: 'minilm', llmProvider: null }, + embeddings: { model: 'nomic-v1.5', llmProvider: null }, llm: { provider: null, model: null, baseUrl: null, apiKey: null, apiKeyCommand: null }, search: { defaultMinScore: 0.2, rrfK: 60, topK: 15 }, ci: { failOnCycles: false, impactThreshold: null }, diff --git a/src/embedder.js b/src/embedder.js index 938a5976..4aba1e7d 100644 --- a/src/embedder.js +++ b/src/embedder.js @@ -98,7 +98,7 @@ export const MODELS = { export const EMBEDDING_STRATEGIES = ['structured', 'source']; -export const DEFAULT_MODEL = 'minilm'; +export const DEFAULT_MODEL = 'nomic-v1.5'; const BATCH_SIZE_MAP = { minilm: 32, 'jina-small': 16, diff --git a/tests/unit/config.test.js b/tests/unit/config.test.js index c005e6cb..e922abe5 100644 --- a/tests/unit/config.test.js +++ b/tests/unit/config.test.js @@ -55,7 +55,7 @@ describe('DEFAULTS', () => { }); it('has embeddings defaults', () => { - expect(DEFAULTS.embeddings).toEqual({ model: 'minilm', llmProvider: null }); + expect(DEFAULTS.embeddings).toEqual({ model: 'nomic-v1.5', llmProvider: null }); }); it('has llm defaults', () => { From f977f9ca230a18913de2d2696428e2a05b7dbe03 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 25 Feb 2026 10:06:22 -0700 Subject: [PATCH 4/4] docs: add co-change analysis to README and mark backlog #9 done MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Document the new git co-change analysis feature from PR #95: - Add co-change commands section, feature table row, comparison rows - Update MCP tool count 18 → 19 across all mentions - Mark BACKLOG item #9 as DONE with implementation details --- README.md | 31 +++++++++++++++++++++++++------ docs/roadmap/BACKLOG.md | 2 +- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 537fa178..9b3423fa 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ cd your-project codegraph build ``` -That's it. No config files, no Docker, no JVM, no API keys, no accounts. The graph is ready to query. Add `codegraph mcp` to your AI agent's config and it has full access to your dependency graph through 18 MCP tools. +That's it. No config files, no Docker, no JVM, no API keys, no accounts. The graph is ready to query. Add `codegraph mcp` to your AI agent's config and it has full access to your dependency graph through 19 MCP tools. ### Why it matters @@ -78,6 +78,7 @@ That's it. No config files, no Docker, no JVM, no API keys, no accounts. The gra | Semantic search | **Yes** | — | **Yes** | **Yes** | — | **Yes** | — | — | | MCP / AI agent support | **Yes** | — | **Yes** | **Yes** | **Yes** | **Yes** | **Yes** | — | | Git diff impact | **Yes** | — | — | — | — | **Yes** | — | **Yes** | +| Git co-change analysis | **Yes** | — | — | — | — | — | **Yes** | **Yes** | | Watch mode | **Yes** | — | **Yes** | — | — | — | — | — | | Dead code / role classification | **Yes** | — | **Yes** | — | — | — | — | **Yes** | | Cycle detection | **Yes** | — | **Yes** | — | — | — | — | **Yes** | @@ -96,9 +97,9 @@ That's it. No config files, no Docker, no JVM, no API keys, no accounts. The gra | **🔓** | **Zero-cost core, LLM-enhanced when you want** | Full graph analysis with no API keys, no accounts, no cost. Optionally bring your own LLM provider — your code only goes where you choose | | **🔬** | **Function-level, not just files** | Traces `handleAuth()` → `validateToken()` → `decryptJWT()` and shows 14 callers across 9 files break if `decryptJWT` changes | | **🏷️** | **Role classification** | Every symbol auto-tagged as `entry`/`core`/`utility`/`adapter`/`dead`/`leaf` — agents instantly know what they're looking at | -| **🤖** | **Built for AI agents** | 18-tool [MCP server](https://modelcontextprotocol.io/) — AI assistants query your graph directly. Single-repo by default | +| **🤖** | **Built for AI agents** | 19-tool [MCP server](https://modelcontextprotocol.io/) — AI assistants query your graph directly. Single-repo by default | | **🌐** | **Multi-language, one CLI** | JS/TS + Python + Go + Rust + Java + C# + PHP + Ruby + HCL in a single graph | -| **💥** | **Git diff impact** | `codegraph diff-impact` shows changed functions, their callers, and full blast radius — ships with a GitHub Actions workflow | +| **💥** | **Git diff impact** | `codegraph diff-impact` shows changed functions, their callers, and full blast radius — enriched with historically coupled files from git co-change analysis. Ships with a GitHub Actions workflow | | **🧠** | **Semantic search** | Local embeddings by default, LLM-powered when opted in — multi-query with RRF ranking via `"auth; token; JWT"` | --- @@ -143,7 +144,7 @@ After modifying code: Or connect directly via MCP: ```bash -codegraph mcp # 18-tool MCP server — AI queries the graph directly +codegraph mcp # 19-tool MCP server — AI queries the graph directly ``` Full agent setup: [AI Agent Guide](docs/guides/ai-agent-guide.md) · [CLAUDE.md template](docs/guides/ai-agent-guide.md#claudemd-template) @@ -161,6 +162,7 @@ Full agent setup: [AI Agent Guide](docs/guides/ai-agent-guide.md) · [CLAU | 🎯 | **Deep context** | `context` gives AI agents source, deps, callers, signature, and tests for a function in one call; `explain` gives structural summaries of files or functions | | 📍 | **Fast lookup** | `where` shows exactly where a symbol is defined and used — minimal, fast | | 📊 | **Diff impact** | Parse `git diff`, find overlapping functions, trace their callers | +| 🔗 | **Co-change analysis** | Analyze git history for files that always change together — surfaces hidden coupling the static graph can't see; enriches `diff-impact` with historically coupled files | | 🗺️ | **Module map** | Bird's-eye view of your most-connected files | | 🏗️ | **Structure & hotspots** | Directory cohesion scores, fan-in/fan-out hotspot detection, module boundaries | | 🏷️ | **Node role classification** | Every symbol auto-tagged as `entry`/`core`/`utility`/`adapter`/`dead`/`leaf` based on connectivity patterns — agents instantly know architectural role | @@ -168,7 +170,7 @@ Full agent setup: [AI Agent Guide](docs/guides/ai-agent-guide.md) · [CLAU | 📤 | **Export** | DOT (Graphviz), Mermaid, and JSON graph export | | 🧠 | **Semantic search** | Embeddings-powered natural language search with multi-query RRF ranking | | 👀 | **Watch mode** | Incrementally update the graph as files change | -| 🤖 | **MCP server** | 18-tool MCP server for AI assistants; single-repo by default, opt-in multi-repo | +| 🤖 | **MCP server** | 19-tool MCP server for AI assistants; single-repo by default, opt-in multi-repo | | ⚡ | **Always fresh** | Three-tier incremental detection — sub-second rebuilds even on large codebases | ## 📦 Commands @@ -219,6 +221,22 @@ codegraph diff-impact HEAD~3 # Impact vs a specific ref codegraph diff-impact main --format mermaid -T # Mermaid flowchart of blast radius ``` +### Co-Change Analysis + +Analyze git history to find files that always change together — surfaces hidden coupling the static graph can't see. Requires a git repository. + +```bash +codegraph co-change --analyze # Scan git history and populate co-change data +codegraph co-change src/queries.js # Show co-change partners for a file +codegraph co-change # Show top co-changing file pairs globally +codegraph co-change --since 6m # Limit to last 6 months of history +codegraph co-change --min-jaccard 0.5 # Only show strong coupling (Jaccard >= 0.5) +codegraph co-change --min-support 5 # Minimum co-commit count +codegraph co-change --full # Include all details +``` + +Co-change data also enriches `diff-impact` — historically coupled files appear in a `historicallyCoupled` section alongside the static dependency analysis. + ### Structure & Hotspots ```bash @@ -408,7 +426,7 @@ Optional: `@huggingface/transformers` (semantic search), `@modelcontextprotocol/ ### MCP Server -Codegraph includes a built-in [Model Context Protocol](https://modelcontextprotocol.io/) server with 18 tools, so AI assistants can query your dependency graph directly: +Codegraph includes a built-in [Model Context Protocol](https://modelcontextprotocol.io/) server with 19 tools, so AI assistants can query your dependency graph directly: ```bash codegraph mcp # Single-repo mode (default) — only local project @@ -595,6 +613,7 @@ const { results: fused } = await multiSearchData( | Incremental rebuilds | **O(changed)** | — | O(n) Merkle | — | — | — | | MCP / AI agent support | **Yes** | — | **Yes** | **Yes** | **Yes** | **Yes** | | Git diff impact | **Yes** | — | — | — | — | **Yes** | +| Git co-change analysis | **Yes** | — | — | — | — | — | | Dead code / role classification | **Yes** | — | **Yes** | — | — | — | | Semantic search | **Yes** | — | **Yes** | **Yes** | — | **Yes** | | Watch mode | **Yes** | — | **Yes** | — | — | — | diff --git a/docs/roadmap/BACKLOG.md b/docs/roadmap/BACKLOG.md index efd66c29..c7295544 100644 --- a/docs/roadmap/BACKLOG.md +++ b/docs/roadmap/BACKLOG.md @@ -27,7 +27,7 @@ Non-breaking, ordered by problem-fit: | ID | Title | Description | Category | Benefit | Zero-dep | Foundation-aligned | Problem-fit (1-5) | Breaking | |----|-------|-------------|----------|---------|----------|-------------------|-------------------|----------| | 4 | ~~Node classification~~ | ~~Auto-tag symbols as Entry Point / Core / Utility / Adapter based on in-degree/out-degree patterns. High fan-in + low fan-out = Core. Zero fan-in + non-export = Dead. Inspired by arbor.~~ | Intelligence | ~~Agents immediately understand architectural role of any symbol without reading surrounding code — fewer orientation tokens~~ | ✓ | ✓ | 5 | No | **DONE** — `classifyNodeRoles()` in `structure.js` auto-tags every symbol as `entry`/`core`/`utility`/`adapter`/`dead`/`leaf` using median-based fan-in/fan-out thresholds. Roles stored in DB (`role` column, migration v5), surfaced in `where`/`explain`/`context`/`stats`/`list-functions`, new `roles` CLI command, new `node_roles` MCP tool (18 tools total). Includes `--role` and `--file` filters. | -| 9 | Git change coupling | Analyze git history for files/functions that always change together. Surfaces hidden dependencies that the static graph can't see. Enhances `diff-impact` with historical co-change data. Inspired by axon. | Analysis | `diff-impact` catches more breakage by including historically coupled files; agents get a more complete blast radius picture | ✓ | ✓ | 5 | No | +| 9 | ~~Git change coupling~~ | ~~Analyze git history for files/functions that always change together. Surfaces hidden dependencies that the static graph can't see. Enhances `diff-impact` with historical co-change data. Inspired by axon.~~ | Analysis | ~~`diff-impact` catches more breakage by including historically coupled files; agents get a more complete blast radius picture~~ | ✓ | ✓ | 5 | No | **DONE** — `src/cochange.js` module with scan, compute, analyze, and query functions. DB migration v5 adds `co_changes` + `co_change_meta` tables. CLI command `codegraph co-change [file]` with `--analyze`, `--since`, `--min-support`, `--min-jaccard`, `--full` options. Integrates into `diff-impact` output via `historicallyCoupled` section. New `co_changes` MCP tool (19 tools total). Uses Jaccard similarity on commit history. | | 1 | ~~Dead code detection~~ | ~~Find symbols with zero incoming edges (excluding entry points and exports). Agents constantly ask "is this used?" — the graph already has the data, we just need to surface it. Inspired by narsil-mcp, axon, codexray, CKB.~~ | Analysis | ~~Agents stop wasting tokens investigating dead code; developers get actionable cleanup lists without external tools~~ | ✓ | ✓ | 4 | No | **DONE** — Delivered as part of node classification (ID 4). `codegraph roles --role dead -T` lists all symbols with zero fan-in that aren't exported. | | 2 | Shortest path A→B | BFS/Dijkstra on the existing edges table to find how symbol A reaches symbol B. We have `fn` for single-node chains but no A→B pathfinding. Inspired by codexray, arbor. | Navigation | Agents can answer "how does this function reach that one?" in one call instead of manually tracing chains | ✓ | ✓ | 4 | No | | 12 | Execution flow tracing | Framework-aware entry point detection (Express routes, CLI commands, event handlers) + BFS flow tracing from entry to leaf. Inspired by axon, GitNexus, code-context-mcp. | Navigation | Agents can answer "what happens when a user hits POST /login?" by tracing the full execution path in one query | ✓ | ✓ | 4 | No |