diff --git a/.codegraphrc.json b/.codegraphrc.json new file mode 100644 index 00000000..98663830 --- /dev/null +++ b/.codegraphrc.json @@ -0,0 +1,3 @@ +{ + "embeddings": { "model": "bge-large" } +} diff --git a/README.md b/README.md index 537fa178..9b3423fa 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ cd your-project codegraph build ``` -That's it. No config files, no Docker, no JVM, no API keys, no accounts. The graph is ready to query. Add `codegraph mcp` to your AI agent's config and it has full access to your dependency graph through 18 MCP tools. +That's it. No config files, no Docker, no JVM, no API keys, no accounts. The graph is ready to query. Add `codegraph mcp` to your AI agent's config and it has full access to your dependency graph through 19 MCP tools. ### Why it matters @@ -78,6 +78,7 @@ That's it. No config files, no Docker, no JVM, no API keys, no accounts. The gra | Semantic search | **Yes** | — | **Yes** | **Yes** | — | **Yes** | — | — | | MCP / AI agent support | **Yes** | — | **Yes** | **Yes** | **Yes** | **Yes** | **Yes** | — | | Git diff impact | **Yes** | — | — | — | — | **Yes** | — | **Yes** | +| Git co-change analysis | **Yes** | — | — | — | — | — | **Yes** | **Yes** | | Watch mode | **Yes** | — | **Yes** | — | — | — | — | — | | Dead code / role classification | **Yes** | — | **Yes** | — | — | — | — | **Yes** | | Cycle detection | **Yes** | — | **Yes** | — | — | — | — | **Yes** | @@ -96,9 +97,9 @@ That's it. No config files, no Docker, no JVM, no API keys, no accounts. The gra | **🔓** | **Zero-cost core, LLM-enhanced when you want** | Full graph analysis with no API keys, no accounts, no cost. 
Optionally bring your own LLM provider — your code only goes where you choose | | **🔬** | **Function-level, not just files** | Traces `handleAuth()` → `validateToken()` → `decryptJWT()` and shows 14 callers across 9 files break if `decryptJWT` changes | | **🏷️** | **Role classification** | Every symbol auto-tagged as `entry`/`core`/`utility`/`adapter`/`dead`/`leaf` — agents instantly know what they're looking at | -| **🤖** | **Built for AI agents** | 18-tool [MCP server](https://modelcontextprotocol.io/) — AI assistants query your graph directly. Single-repo by default | +| **🤖** | **Built for AI agents** | 19-tool [MCP server](https://modelcontextprotocol.io/) — AI assistants query your graph directly. Single-repo by default | | **🌐** | **Multi-language, one CLI** | JS/TS + Python + Go + Rust + Java + C# + PHP + Ruby + HCL in a single graph | -| **💥** | **Git diff impact** | `codegraph diff-impact` shows changed functions, their callers, and full blast radius — ships with a GitHub Actions workflow | +| **💥** | **Git diff impact** | `codegraph diff-impact` shows changed functions, their callers, and full blast radius — enriched with historically coupled files from git co-change analysis. 
Ships with a GitHub Actions workflow | | **🧠** | **Semantic search** | Local embeddings by default, LLM-powered when opted in — multi-query with RRF ranking via `"auth; token; JWT"` | --- @@ -143,7 +144,7 @@ After modifying code: Or connect directly via MCP: ```bash -codegraph mcp # 18-tool MCP server — AI queries the graph directly +codegraph mcp # 19-tool MCP server — AI queries the graph directly ``` Full agent setup: [AI Agent Guide](docs/guides/ai-agent-guide.md) · [CLAUDE.md template](docs/guides/ai-agent-guide.md#claudemd-template) @@ -161,6 +162,7 @@ Full agent setup: [AI Agent Guide](docs/guides/ai-agent-guide.md) · [CLAU | 🎯 | **Deep context** | `context` gives AI agents source, deps, callers, signature, and tests for a function in one call; `explain` gives structural summaries of files or functions | | 📍 | **Fast lookup** | `where` shows exactly where a symbol is defined and used — minimal, fast | | 📊 | **Diff impact** | Parse `git diff`, find overlapping functions, trace their callers | +| 🔗 | **Co-change analysis** | Analyze git history for files that always change together — surfaces hidden coupling the static graph can't see; enriches `diff-impact` with historically coupled files | | 🗺️ | **Module map** | Bird's-eye view of your most-connected files | | 🏗️ | **Structure & hotspots** | Directory cohesion scores, fan-in/fan-out hotspot detection, module boundaries | | 🏷️ | **Node role classification** | Every symbol auto-tagged as `entry`/`core`/`utility`/`adapter`/`dead`/`leaf` based on connectivity patterns — agents instantly know architectural role | @@ -168,7 +170,7 @@ Full agent setup: [AI Agent Guide](docs/guides/ai-agent-guide.md) · [CLAU | 📤 | **Export** | DOT (Graphviz), Mermaid, and JSON graph export | | 🧠 | **Semantic search** | Embeddings-powered natural language search with multi-query RRF ranking | | 👀 | **Watch mode** | Incrementally update the graph as files change | -| 🤖 | **MCP server** | 18-tool MCP server for AI assistants; 
single-repo by default, opt-in multi-repo | +| 🤖 | **MCP server** | 19-tool MCP server for AI assistants; single-repo by default, opt-in multi-repo | | ⚡ | **Always fresh** | Three-tier incremental detection — sub-second rebuilds even on large codebases | ## 📦 Commands @@ -219,6 +221,22 @@ codegraph diff-impact HEAD~3 # Impact vs a specific ref codegraph diff-impact main --format mermaid -T # Mermaid flowchart of blast radius ``` +### Co-Change Analysis + +Analyze git history to find files that always change together — surfaces hidden coupling the static graph can't see. Requires a git repository. + +```bash +codegraph co-change --analyze # Scan git history and populate co-change data +codegraph co-change src/queries.js # Show co-change partners for a file +codegraph co-change # Show top co-changing file pairs globally +codegraph co-change --since 6m # Limit to last 6 months of history +codegraph co-change --min-jaccard 0.5 # Only show strong coupling (Jaccard >= 0.5) +codegraph co-change --min-support 5 # Minimum co-commit count +codegraph co-change --full # Include all details +``` + +Co-change data also enriches `diff-impact` — historically coupled files appear in a `historicallyCoupled` section alongside the static dependency analysis. 
+ ### Structure & Hotspots ```bash @@ -408,7 +426,7 @@ Optional: `@huggingface/transformers` (semantic search), `@modelcontextprotocol/ ### MCP Server -Codegraph includes a built-in [Model Context Protocol](https://modelcontextprotocol.io/) server with 18 tools, so AI assistants can query your dependency graph directly: +Codegraph includes a built-in [Model Context Protocol](https://modelcontextprotocol.io/) server with 19 tools, so AI assistants can query your dependency graph directly: ```bash codegraph mcp # Single-repo mode (default) — only local project @@ -595,6 +613,7 @@ const { results: fused } = await multiSearchData( | Incremental rebuilds | **O(changed)** | — | O(n) Merkle | — | — | — | | MCP / AI agent support | **Yes** | — | **Yes** | **Yes** | **Yes** | **Yes** | | Git diff impact | **Yes** | — | — | — | — | **Yes** | +| Git co-change analysis | **Yes** | — | — | — | — | — | | Dead code / role classification | **Yes** | — | **Yes** | — | — | — | | Semantic search | **Yes** | — | **Yes** | **Yes** | — | **Yes** | | Watch mode | **Yes** | — | **Yes** | — | — | — | diff --git a/docs/roadmap/BACKLOG.md b/docs/roadmap/BACKLOG.md index efd66c29..c7295544 100644 --- a/docs/roadmap/BACKLOG.md +++ b/docs/roadmap/BACKLOG.md @@ -27,7 +27,7 @@ Non-breaking, ordered by problem-fit: | ID | Title | Description | Category | Benefit | Zero-dep | Foundation-aligned | Problem-fit (1-5) | Breaking | |----|-------|-------------|----------|---------|----------|-------------------|-------------------|----------| | 4 | ~~Node classification~~ | ~~Auto-tag symbols as Entry Point / Core / Utility / Adapter based on in-degree/out-degree patterns. High fan-in + low fan-out = Core. Zero fan-in + non-export = Dead. 
Inspired by arbor.~~ | Intelligence | ~~Agents immediately understand architectural role of any symbol without reading surrounding code — fewer orientation tokens~~ | ✓ | ✓ | 5 | No | **DONE** — `classifyNodeRoles()` in `structure.js` auto-tags every symbol as `entry`/`core`/`utility`/`adapter`/`dead`/`leaf` using median-based fan-in/fan-out thresholds. Roles stored in DB (`role` column, migration v5), surfaced in `where`/`explain`/`context`/`stats`/`list-functions`, new `roles` CLI command, new `node_roles` MCP tool (18 tools total). Includes `--role` and `--file` filters. | -| 9 | Git change coupling | Analyze git history for files/functions that always change together. Surfaces hidden dependencies that the static graph can't see. Enhances `diff-impact` with historical co-change data. Inspired by axon. | Analysis | `diff-impact` catches more breakage by including historically coupled files; agents get a more complete blast radius picture | ✓ | ✓ | 5 | No | +| 9 | ~~Git change coupling~~ | ~~Analyze git history for files/functions that always change together. Surfaces hidden dependencies that the static graph can't see. Enhances `diff-impact` with historical co-change data. Inspired by axon.~~ | Analysis | ~~`diff-impact` catches more breakage by including historically coupled files; agents get a more complete blast radius picture~~ | ✓ | ✓ | 5 | No | **DONE** — `src/cochange.js` module with scan, compute, analyze, and query functions. DB migration v5 adds `co_changes` + `co_change_meta` tables. CLI command `codegraph co-change [file]` with `--analyze`, `--since`, `--min-support`, `--min-jaccard`, `--full` options. Integrates into `diff-impact` output via `historicallyCoupled` section. New `co_changes` MCP tool (19 tools total). Uses Jaccard similarity on commit history. | | 1 | ~~Dead code detection~~ | ~~Find symbols with zero incoming edges (excluding entry points and exports). Agents constantly ask "is this used?" 
— the graph already has the data, we just need to surface it. Inspired by narsil-mcp, axon, codexray, CKB.~~ | Analysis | ~~Agents stop wasting tokens investigating dead code; developers get actionable cleanup lists without external tools~~ | ✓ | ✓ | 4 | No | **DONE** — Delivered as part of node classification (ID 4). `codegraph roles --role dead -T` lists all symbols with zero fan-in that aren't exported. | | 2 | Shortest path A→B | BFS/Dijkstra on the existing edges table to find how symbol A reaches symbol B. We have `fn` for single-node chains but no A→B pathfinding. Inspired by codexray, arbor. | Navigation | Agents can answer "how does this function reach that one?" in one call instead of manually tracing chains | ✓ | ✓ | 4 | No | | 12 | Execution flow tracing | Framework-aware entry point detection (Express routes, CLI commands, event handlers) + BFS flow tracing from entry to leaf. Inspired by axon, GitNexus, code-context-mcp. | Navigation | Agents can answer "what happens when a user hits POST /login?" 
by tracing the full execution path in one query | ✓ | ✓ | 4 | No | diff --git a/src/cli.js b/src/cli.js index 60dbac63..e048ac44 100644 --- a/src/cli.js +++ b/src/cli.js @@ -7,7 +7,13 @@ import { buildGraph } from './builder.js'; import { loadConfig } from './config.js'; import { findCycles, formatCycles } from './cycles.js'; import { openReadonlyOrFail } from './db.js'; -import { buildEmbeddings, EMBEDDING_STRATEGIES, MODELS, search } from './embedder.js'; +import { + buildEmbeddings, + DEFAULT_MODEL, + EMBEDDING_STRATEGIES, + MODELS, + search, +} from './embedder.js'; import { exportDOT, exportJSON, exportMermaid } from './export.js'; import { setVerbose } from './logger.js'; import { @@ -423,12 +429,13 @@ program .command('models') .description('List available embedding models') .action(() => { + const defaultModel = config.embeddings?.model || DEFAULT_MODEL; console.log('\nAvailable embedding models:\n'); - for (const [key, config] of Object.entries(MODELS)) { - const def = key === 'minilm' ? ' (default)' : ''; - const ctx = config.contextWindow ? `${config.contextWindow} ctx` : ''; + for (const [key, cfg] of Object.entries(MODELS)) { + const def = key === defaultModel ? ' (default)' : ''; + const ctx = cfg.contextWindow ? `${cfg.contextWindow} ctx` : ''; console.log( - ` ${key.padEnd(12)} ${String(config.dim).padStart(4)}d ${ctx.padEnd(9)} ${config.desc}${def}`, + ` ${key.padEnd(12)} ${String(cfg.dim).padStart(4)}d ${ctx.padEnd(9)} ${cfg.desc}${def}`, ); } console.log('\nUsage: codegraph embed --model --strategy '); @@ -442,8 +449,7 @@ program ) .option( '-m, --model ', - 'Embedding model: minilm (default), jina-small, jina-base, jina-code, nomic, nomic-v1.5, bge-large. Run `codegraph models` for details', - 'minilm', + 'Embedding model (default from config or nomic-v1.5). 
Run `codegraph models` for details', ) .option( '-s, --strategy ', @@ -458,7 +464,8 @@ program process.exit(1); } const root = path.resolve(dir || '.'); - await buildEmbeddings(root, opts.model, undefined, { strategy: opts.strategy }); + const model = opts.model || config.embeddings?.model || DEFAULT_MODEL; + await buildEmbeddings(root, model, undefined, { strategy: opts.strategy }); }); program diff --git a/src/embedder.js b/src/embedder.js index 938a5976..4aba1e7d 100644 --- a/src/embedder.js +++ b/src/embedder.js @@ -98,7 +98,7 @@ export const MODELS = { export const EMBEDDING_STRATEGIES = ['structured', 'source']; -export const DEFAULT_MODEL = 'minilm'; +export const DEFAULT_MODEL = 'nomic-v1.5'; const BATCH_SIZE_MAP = { minilm: 32, 'jina-small': 16,